def _register_builtin_tools(self) -> None:
    """Register all built-in tools.

    Every tool group lives in its own module and is optional: when a module
    (or one of the names expected from it) cannot be imported, a warning is
    logged and the remaining groups are still processed.  Registration is
    all-or-nothing per module - nothing from a module is registered unless
    every function and schema it is expected to provide was found.

    For the ``tools.*`` modules the schema of tool ``foo_bar`` is looked up
    as the module attribute ``FOO_BAR_SCHEMA``.  The website downloader
    exposes its schema through ``get_tool_schema()`` instead, so it is
    handled on its own.
    """
    # Function-scope import so the block does not depend on a new
    # module-level import.
    from importlib import import_module

    # === Website Downloader Tool ===
    try:
        from website_downloader_tool import (
            website_downloader,
            get_tool_schema as get_website_downloader_schema,
        )
        self.register_tool(
            name="website_downloader",
            function=website_downloader,
            schema=get_website_downloader_schema(),
        )
    except ImportError as e:
        log.warning(f"Could not import website_downloader_tool: {e}")

    # Module name (inside the ``tools`` package) -> tool function names,
    # listed in registration order.
    tool_groups = {
        "wikipedia_tool": (
            "wikipedia_search",
            "wikipedia_get_article",
            "wikipedia_get_full_article",
        ),
        "news_tool": (
            "news_search_hackernews",
            "news_get_top_stories",
            "news_get_reddit",
            "news_search_reddit",
            "news_aggregate",
        ),
        "finance_tool": (
            "finance_get_stock_info",
            "finance_get_stock_history",
            "finance_get_crypto_price",
            "finance_get_top_cryptos",
            "finance_get_exchange_rate",
            "finance_search_crypto",
        ),
        "medical_tool": (
            "medical_search_pubmed",
            "medical_get_pubmed_abstract",
            "medical_get_disease_data",
            "medical_get_covid_country",
            "medical_search_fda",
            "medical_get_health_topics",
        ),
        "weather_tool": (
            "weather_get_current",
            "weather_get_forecast",
            "weather_get_air_quality",
        ),
        "science_tool": (
            "science_search_arxiv",
            "science_search_semantic_scholar",
            "science_get_paper_details",
            "science_search_doaj",
            "science_aggregate_search",
        ),
        "web_tool": (
            "web_search",
            "web_instant_answer",
            "web_get_page_content",
            "web_search_and_fetch",
        ),
    }

    for module_name, tool_names in tool_groups.items():
        try:
            module = import_module(f"tools.{module_name}")
            # Resolve everything first so a missing name skips the whole
            # group, exactly like a failing ``from module import (...)``.
            resolved = [
                (
                    tool_name,
                    getattr(module, tool_name),
                    getattr(module, f"{tool_name.upper()}_SCHEMA"),
                )
                for tool_name in tool_names
            ]
        except (ImportError, AttributeError) as e:
            log.warning(f"Could not import {module_name}: {e}")
            continue

        for tool_name, function, schema in resolved:
            self.register_tool(name=tool_name, function=function, schema=schema)

    log.info(f"Registered {len(self._tools)} built-in tools")
def _finite_float(value):
    """Return *value* as a float, or None when it is missing, NaN or infinite."""
    import math

    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def finance_get_stock_info(
    symbol: str,
) -> dict:
    """
    Get stock information from Yahoo Finance.

    Args:
        symbol: Stock ticker symbol (e.g., AAPL, GOOGL, TSLA)

    Returns:
        Dictionary with ``success``, ``source``, ``symbol`` and the quote
        fields read from the ticker's ``info`` mapping; fields whose value is
        None are left out.  On failure the dictionary holds
        ``success: False`` and an ``error`` message.
    """
    ticker_symbol = symbol.strip().upper() if isinstance(symbol, str) else ""
    if not ticker_symbol:
        # Nothing to look up - answer without calling Yahoo Finance.
        return {
            "success": False,
            "error": "A stock ticker symbol is required",
            "source": "yahoo_finance",
        }

    try:
        import yfinance as yf

        info = yf.Ticker(ticker_symbol).info or {}

        # ``x or y`` is used instead of ``.get(key, default)`` wherever a key
        # may be present with a None value: a .get() default only applies
        # when the key is missing.
        result = {
            "success": True,
            "source": "yahoo_finance",
            "symbol": ticker_symbol,
            "company_name": info.get("longName") or info.get("shortName") or "",
            "current_price": info.get("currentPrice") or info.get("regularMarketPrice"),
            "previous_close": info.get("previousClose"),
            "open": info.get("open"),
            "day_high": info.get("dayHigh"),
            "day_low": info.get("dayLow"),
            "52_week_high": info.get("fiftyTwoWeekHigh"),
            "52_week_low": info.get("fiftyTwoWeekLow"),
            "market_cap": info.get("marketCap"),
            "pe_ratio": info.get("trailingPE"),
            "forward_pe": info.get("forwardPE"),
            "dividend_yield": info.get("dividendYield"),
            "volume": info.get("volume"),
            "avg_volume": info.get("averageVolume"),
            "beta": info.get("beta"),
            "eps": info.get("trailingEps"),
            "revenue": info.get("totalRevenue"),
            "profit_margins": info.get("profitMargins"),
            # Slicing None would raise and fail the whole lookup.
            "description": (info.get("longBusinessSummary") or "")[:1000],
            "sector": info.get("sector"),
            "industry": info.get("industry"),
            "website": info.get("website"),
            # NOTE: naive local time, as before.
            "timestamp": datetime.now().isoformat(),
        }

        # Remove None values
        return {key: value for key, value in result.items() if value is not None}

    except ImportError:
        return {
            "success": False,
            "error": "yfinance not installed. Run: pip install yfinance",
            "source": "yahoo_finance",
        }
    except Exception as e:
        log.error(f"Stock info fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "yahoo_finance",
            "symbol": ticker_symbol,
        }


def finance_get_stock_history(
    symbol: str,
    period: str = "1mo",
    interval: str = "1d",
) -> dict:
    """
    Get historical stock prices from Yahoo Finance.

    Args:
        symbol: Stock ticker symbol
        period: Time period (1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max)
        interval: Data interval (1m, 2m, 5m, 15m, 30m, 60m, 90m, 1h, 1d, 5d, 1wk, 1mo, 3mo)

    Returns:
        Dictionary with a ``prices`` list holding one entry per row of the
        history frame (date, open, high, low, close rounded to 2 decimals,
        integer volume).  A value that is missing or NaN in the frame is
        reported as None instead of aborting the whole request.
    """
    try:
        import yfinance as yf

        ticker = yf.Ticker(symbol.upper())
        hist = ticker.history(period=period, interval=interval)

        if hist.empty:
            return {
                "success": False,
                "error": f"No historical data found for {symbol}",
                "source": "yahoo_finance",
            }

        def rounded(value):
            number = _finite_float(value)
            return None if number is None else round(number, 2)

        def whole(value):
            # int(NaN) raises ValueError, so filter non-finite values first.
            number = _finite_float(value)
            return None if number is None else int(number)

        prices = [
            {
                "date": index.isoformat(),
                "open": rounded(row["Open"]),
                "high": rounded(row["High"]),
                "low": rounded(row["Low"]),
                "close": rounded(row["Close"]),
                "volume": whole(row["Volume"]),
            }
            for index, row in hist.iterrows()
        ]

        return {
            "success": True,
            "source": "yahoo_finance",
            "symbol": symbol.upper(),
            "period": period,
            "interval": interval,
            "prices": prices,
            "count": len(prices),
        }

    except ImportError:
        return {
            "success": False,
            "error": "yfinance not installed. Run: pip install yfinance",
            "source": "yahoo_finance",
        }
    except Exception as e:
        log.error(f"Stock history fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "yahoo_finance",
        }


def finance_get_crypto_price(
    coin_id: str = "bitcoin",
    vs_currency: str = "usd",
) -> dict:
    """
    Get cryptocurrency price from CoinGecko.

    Args:
        coin_id: Coin ID (e.g., bitcoin, ethereum, dogecoin) - use coin name from CoinGecko
        vs_currency: Currency to show price in (e.g., usd, eur, btc)

    Returns:
        Dictionary with price, market cap, 24h volume, 24h change and the
        last-update time for the coin, or ``success: False`` with an
        ``error`` message when the coin is unknown or the request fails.
    """
    try:
        # Normalise once; the response is keyed by these exact strings.
        coin = coin_id.strip().lower()
        currency = vs_currency.strip().lower()

        response = requests.get(
            f"{COINGECKO_API}/simple/price",
            params={
                "ids": coin,
                "vs_currencies": currency,
                "include_market_cap": "true",
                "include_24hr_vol": "true",
                "include_24hr_change": "true",
                "include_last_updated_at": "true",
            },
            timeout=10,
        )
        response.raise_for_status()
        payload = response.json()

        if coin not in payload:
            return {
                "success": False,
                "error": f"Coin not found: {coin_id}. Try using the full coin name (e.g., 'bitcoin' not 'btc')",
                "source": "coingecko",
            }

        quote = payload[coin]
        updated_at = quote.get("last_updated_at")
        return {
            "success": True,
            "source": "coingecko",
            "coin_id": coin,
            "currency": currency,
            "price": quote.get(currency),
            "market_cap": quote.get(f"{currency}_market_cap"),
            "24h_volume": quote.get(f"{currency}_24h_vol"),
            "24h_change": quote.get(f"{currency}_24h_change"),
            # NOTE: fromtimestamp() yields naive local time, as before.
            "last_updated": datetime.fromtimestamp(updated_at).isoformat() if updated_at else None,
        }

    except Exception as e:
        log.error(f"Crypto price fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "coingecko",
        }
def finance_get_top_cryptos(
    limit: int = 10,
    vs_currency: str = "usd",
) -> dict:
    """
    Get top cryptocurrencies by market cap from CoinGecko.

    Args:
        limit: Number of coins to return (default: 10).  Values outside
            1..250 are clamped to that range before the request is made.
        vs_currency: Currency for prices (default: usd)

    Returns:
        Dictionary with a ``results`` list (id, symbol, name, price, market
        cap, rank, 24h change, volume, circulating supply, image) or
        ``success: False`` with an ``error`` message.
    """
    try:
        # NOTE(review): 250 is the documented per_page ceiling of
        # /coins/markets as far as I know - confirm against the CoinGecko docs.
        per_page = max(1, min(int(limit), 250))
        currency = vs_currency.lower()

        response = requests.get(
            f"{COINGECKO_API}/coins/markets",
            params={
                "vs_currency": currency,
                "order": "market_cap_desc",
                "per_page": per_page,
                "page": 1,
                "sparkline": "false",
            },
            timeout=10,
        )
        response.raise_for_status()

        results = [
            {
                "id": coin.get("id"),
                "symbol": coin.get("symbol", "").upper(),
                "name": coin.get("name"),
                "price": coin.get("current_price"),
                "market_cap": coin.get("market_cap"),
                "market_cap_rank": coin.get("market_cap_rank"),
                "24h_change": coin.get("price_change_percentage_24h"),
                "volume": coin.get("total_volume"),
                "circulating_supply": coin.get("circulating_supply"),
                "image": coin.get("image"),
            }
            for coin in response.json()
        ]

        return {
            "success": True,
            "source": "coingecko",
            "currency": currency,
            "results": results,
            "count": len(results),
        }

    except Exception as e:
        log.error(f"Top cryptos fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "coingecko",
        }


def finance_get_exchange_rate(
    base_currency: str = "USD",
    target_currency: Optional[str] = None,
) -> dict:
    """
    Get exchange rates from ExchangeRate-API (free).

    Args:
        base_currency: Base currency code (default: USD)
        target_currency: Target currency code (optional, returns all if not specified)

    Returns:
        Dictionary with the single ``rate`` for ``target_currency`` when one
        is given, otherwise with all ``rates`` for the base currency.
        ``success: False`` with an ``error`` message is returned for a
        malformed base code, an unknown target code, or a failed request.
    """
    base = base_currency.strip().upper() if isinstance(base_currency, str) else ""
    # The code becomes a URL path segment, so only a plain three-letter code
    # is let through (this also keeps "../" style input out of the path).
    if not (len(base) == 3 and base.isascii() and base.isalpha()):
        return {
            "success": False,
            "error": f"Invalid base currency code: {base_currency!r}",
            "source": "exchangerate-api",
        }

    try:
        response = requests.get(f"{EXCHANGE_RATE_API}/{base}", timeout=10)
        response.raise_for_status()
        data = response.json()

        rates = data.get("rates", {})

        if target_currency:
            target = target_currency.strip().upper()
            if target not in rates:
                return {
                    "success": False,
                    "error": f"Currency not found: {target}",
                    "source": "exchangerate-api",
                }
            return {
                "success": True,
                "source": "exchangerate-api",
                "base": base,
                "target": target,
                "rate": rates[target],
                "last_updated": data.get("date"),
            }

        return {
            "success": True,
            "source": "exchangerate-api",
            "base": base,
            "rates": rates,
            "count": len(rates),
            "last_updated": data.get("date"),
        }

    except Exception as e:
        log.error(f"Exchange rate fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "exchangerate-api",
        }
def finance_search_crypto(
    query: str,
) -> dict:
    """
    Look up cryptocurrencies on CoinGecko by name or symbol.

    Args:
        query: Search query (coin name or symbol)

    Returns:
        Dictionary with at most ten matching coins (id, upper-cased symbol,
        name, market-cap rank, thumbnail URL), or ``success: False`` with an
        ``error`` message when the request fails.
    """
    try:
        response = requests.get(
            f"{COINGECKO_API}/search",
            params={"query": query},
            timeout=10,
        )
        response.raise_for_status()

        # Only the first ten hits are reported.
        matches = response.json().get("coins", [])[:10]
        results = [
            {
                "id": match.get("id"),
                "symbol": match.get("symbol", "").upper(),
                "name": match.get("name"),
                "market_cap_rank": match.get("market_cap_rank"),
                "thumb": match.get("thumb"),
            }
            for match in matches
        ]
    except Exception as e:
        log.error(f"Crypto search failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "coingecko",
        }

    return {
        "success": True,
        "source": "coingecko",
        "query": query,
        "results": results,
        "count": len(results),
    }
# Function-calling schema for finance_get_crypto_price.  The parameter names
# mirror that function's keyword arguments (coin_id, vs_currency).
FINANCE_GET_CRYPTO_PRICE_SCHEMA = {
    "type": "function",
    "function": {
        "name": "finance_get_crypto_price",
        "description": "Get cryptocurrency price and market data from CoinGecko. Use the full coin name (e.g., 'bitcoin' not 'btc').",
        "parameters": {
            "type": "object",
            "properties": {
                "coin_id": {
                    "type": "string",
                    "description": "CoinGecko coin ID (e.g., bitcoin, ethereum, dogecoin, solana)",
                },
                "vs_currency": {
                    "type": "string",
                    "description": "Currency for price (default: usd)",
                    "default": "usd",
                },
            },
            "required": ["coin_id"],
        },
    },
}

# Function-calling schema for finance_get_top_cryptos.  Both parameters are
# optional; the defaults listed here match the function signature.
FINANCE_GET_TOP_CRYPTOS_SCHEMA = {
    "type": "function",
    "function": {
        "name": "finance_get_top_cryptos",
        "description": "Get top cryptocurrencies by market capitalization from CoinGecko.",
        "parameters": {
            "type": "object",
            "properties": {
                "limit": {
                    "type": "integer",
                    "description": "Number of coins to return (default: 10)",
                    "default": 10,
                },
                "vs_currency": {
                    "type": "string",
                    "description": "Currency for prices (default: usd)",
                    "default": "usd",
                },
            },
            "required": [],
        },
    },
}
def medical_search_pubmed(
    query: str,
    max_results: int = 10,
) -> dict:
    """
    Search PubMed for medical/health research articles.

    Two E-utilities calls are made: ``esearch`` to obtain the matching
    article IDs, then ``esummary`` to fetch a summary for each ID.

    Args:
        query: Search query (medical terms, diseases, treatments, etc.)
        max_results: Maximum number of results (default: 10)

    Returns:
        Dictionary with a ``results`` list (pmid, title, authors, journal,
        pub_date, doi, url, abstract_available), the number of returned
        results and the total hit count reported by the search, or
        ``success: False`` with an ``error`` message.
    """
    try:
        # Step 1: search for article IDs
        search_response = requests.get(
            f"{PUBMED_EUTILS_API}/esearch.fcgi",
            params={
                "db": "pubmed",
                "term": query,
                "retmax": max_results,
                "retmode": "json",
                "sort": "relevance",
            },
            timeout=15,
        )
        search_response.raise_for_status()
        search_result = search_response.json().get("esearchresult", {})

        id_list = search_result.get("idlist", [])
        if not id_list:
            return {
                "success": True,
                "source": "pubmed",
                "query": query,
                "results": [],
                "count": 0,
                "message": "No articles found for this query",
            }

        # Step 2: fetch article summaries
        summary_response = requests.get(
            f"{PUBMED_EUTILS_API}/esummary.fcgi",
            params={
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "json",
            },
            timeout=15,
        )
        summary_response.raise_for_status()
        summaries = summary_response.json().get("result", {})

        results = []
        for article_id in id_list:
            article = summaries.get(article_id, {})
            if not article or "error" in article:
                continue
            results.append({
                "pmid": article_id,
                "title": article.get("title", ""),
                "authors": [a.get("name", "") for a in article.get("authors", [])],
                "journal": article.get("fulljournalname", article.get("source", "")),
                "pub_date": article.get("pubdate", ""),
                "doi": article.get("elocationid", ""),
                "url": f"https://pubmed.ncbi.nlm.nih.gov/{article_id}/",
                # NOTE(review): eSummary appears to flag abstracts through an
                # "attributes" list ("Has Abstract") rather than an "abstract"
                # key - confirm against a live response.  Both are accepted.
                "abstract_available": (
                    "abstract" in article
                    or "Has Abstract" in (article.get("attributes") or [])
                ),
            })

        return {
            "success": True,
            "source": "pubmed",
            "query": query,
            "results": results,
            "count": len(results),
            "total_found": int(search_result.get("count", 0)),
        }

    except Exception as e:
        log.error(f"PubMed search failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "pubmed",
        }


def medical_get_pubmed_abstract(
    pmid: str,
) -> dict:
    """
    Get the abstract of a PubMed article.

    Args:
        pmid: PubMed ID

    Returns:
        Dictionary with the text returned by ``efetch`` for the ID (under
        ``abstract``) and the article URL, or ``success: False`` with an
        ``error`` message.  A blank ID is rejected without a request.
    """
    pmid_text = "" if pmid is None else str(pmid).strip()
    if not pmid_text:
        return {
            "success": False,
            "error": "A PubMed ID is required",
            "source": "pubmed",
            "pmid": pmid,
        }

    try:
        response = requests.get(
            f"{PUBMED_EUTILS_API}/efetch.fcgi",
            params={
                "db": "pubmed",
                "id": pmid_text,
                "rettype": "abstract",
                "retmode": "text",
            },
            timeout=15,
        )
        response.raise_for_status()

        return {
            "success": True,
            "source": "pubmed",
            "pmid": pmid_text,
            "abstract": response.text.strip(),
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid_text}/",
        }

    except Exception as e:
        log.error(f"PubMed abstract fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "pubmed",
            "pmid": pmid,
        }


def medical_get_disease_data(
    disease: str = "covid",
) -> dict:
    """
    Get current disease statistics from Disease.sh API.

    Args:
        disease: Disease type (covid, influenza, or all).  Any other value
            is treated as a country name and looked up in the COVID-19
            country data.

    Returns:
        Dictionary with the raw API payload under ``data``, or
        ``success: False`` with an ``error`` message.
    """
    try:
        from urllib.parse import quote

        disease = disease.lower()

        # NOTE(review): paths follow the disease.sh v3 docs as I read them -
        # influenza data is served per CDC dataset (ILINet used here) and
        # there is no bare "/all", so "all" maps to the global COVID-19
        # totals.  Confirm against the live API.
        known_paths = {
            "covid": "covid-19/all",
            "covid-19": "covid-19/all",
            "coronavirus": "covid-19/all",
            "influenza": "influenza/cdc/ILINet",
            "flu": "influenza/cdc/ILINet",
            "all": "covid-19/all",
        }
        path = known_paths.get(disease)
        if path is None:
            # Try COVID-19 countries data; quote so the value stays one
            # path segment.
            path = f"covid-19/countries/{quote(disease, safe='')}"

        response = requests.get(f"{DISEASE_API}/{path}", timeout=10)
        response.raise_for_status()

        return {
            "success": True,
            "source": "disease.sh",
            "disease": disease,
            "data": response.json(),
            "timestamp": datetime.now().isoformat(),
        }

    except Exception as e:
        log.error(f"Disease data fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "disease.sh",
        }


def medical_get_covid_country(
    country: str = "usa",
) -> dict:
    """
    Get COVID-19 statistics for a specific country.

    Args:
        country: Country name or ISO code (e.g., usa, uk, germany, china)

    Returns:
        Dictionary with the country's case, death, recovery, test and
        population figures, or ``success: False`` with an ``error`` message.
        A blank country is rejected without a request.
    """
    country_text = country.strip() if isinstance(country, str) else ""
    if not country_text:
        return {
            "success": False,
            "error": "A country name or ISO code is required",
            "source": "disease.sh",
        }

    try:
        from urllib.parse import quote

        # Quote so spaces, "/" or "?" in the name cannot change the path.
        url = f"{DISEASE_API}/covid-19/countries/{quote(country_text, safe='')}"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()

        updated_ms = data.get("updated")
        return {
            "success": True,
            "source": "disease.sh",
            "country": data.get("country", country),
            "cases": data.get("cases"),
            "today_cases": data.get("todayCases"),
            "deaths": data.get("deaths"),
            "today_deaths": data.get("todayDeaths"),
            "recovered": data.get("recovered"),
            "active": data.get("active"),
            "critical": data.get("critical"),
            "cases_per_million": data.get("casesPerOneMillion"),
            "deaths_per_million": data.get("deathsPerOneMillion"),
            "tests": data.get("tests"),
            "tests_per_million": data.get("testsPerOneMillion"),
            "population": data.get("population"),
            "continent": data.get("continent"),
            # "updated" is divided by 1000 before conversion, i.e. it is
            # treated as epoch milliseconds.
            "updated": datetime.fromtimestamp(updated_ms / 1000).isoformat() if updated_ms else None,
        }

    except Exception as e:
        log.error(f"COVID country data fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "disease.sh",
        }
def _first_text(values, max_chars=None):
    """Return the first entry of a list-valued field as text ('' when absent or empty)."""
    if not values:
        return ""
    first = values[0] if isinstance(values, (list, tuple)) else values
    text = "" if first is None else str(first)
    return text if max_chars is None else text[:max_chars]


def medical_search_fda(
    query: str,
    database: str = "drug",
    limit: int = 10,
) -> dict:
    """
    Search FDA drug, device, or food databases.

    Args:
        query: Search query
        database: Database to search (drug, device, food, other).  An
            unrecognised name falls back to ``drug``.
        limit: Maximum results (default: 10)

    Returns:
        Dictionary with a ``results`` list - condensed label fields for the
        drug database, raw records otherwise - plus the database actually
        searched.  A 404 response is reported as an empty, successful
        result; other failures give ``success: False`` with an ``error``.
    """
    try:
        # Map database names to FDA endpoints
        db_map = {
            "drug": "drug/label",
            "device": "device/510k",
            "food": "food/enforcement",
            "other": "other/substance",
        }

        # Resolve the name once so endpoint choice and result formatting
        # cannot disagree.
        db_key = (database or "").strip().lower()
        if db_key not in db_map:
            db_key = "drug"

        response = requests.get(
            f"{OPENFDA_API}/{db_map[db_key]}.json",
            params={
                "search": query,
                "limit": limit,
            },
            timeout=15,
        )

        # NOTE(review): openFDA is documented to answer 404 (NOT_FOUND) when
        # a search has no matches - confirm.  That is an empty result, not a
        # failed request.
        if response.status_code == 404:
            items = []
        else:
            response.raise_for_status()
            items = response.json().get("results", [])

        results = []
        for item in items:
            if db_key != "drug":
                results.append(item)
                continue
            openfda = item.get("openfda") or {}
            results.append({
                "brand_name": _first_text(openfda.get("brand_name")),
                "generic_name": _first_text(openfda.get("generic_name")),
                "manufacturer": _first_text(openfda.get("manufacturer_name")),
                "purpose": _first_text(item.get("purpose")),
                "indications": _first_text(item.get("indications_and_usage"), 500),
                "warnings": _first_text(item.get("warnings"), 500),
            })

        return {
            "success": True,
            "source": "openfda",
            "database": db_key,
            "query": query,
            "results": results,
            "count": len(results),
        }

    except Exception as e:
        log.error(f"FDA search failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "openfda",
        }


def medical_get_health_topics(
    topic: Optional[str] = None,
    limit: int = 10,
) -> dict:
    """
    Get health topics from Health.gov.

    Args:
        topic: Health topic to search (optional)
        limit: Maximum results (default: 10)

    Returns:
        Dictionary with up to ``limit`` topics (title, url, image_url,
        image_alt, categories, first 500 characters of the first section's
        content), or ``success: False`` with an ``error`` message.
    """
    try:
        url = f"{HEALTH_GOV_API}/myhealthfinder/api/v3/topicsearch.json"
        params = {"lang": "en"}

        # NOTE(review): parameter names "topic"/"pageSize" are kept as they
        # were; verify them against the MyHealthfinder API documentation.
        if topic:
            params["topic"] = topic
        else:
            params["pageSize"] = limit

        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        topics = data.get("Result", {}).get("Resources", {}).get("Resource", [])
        if isinstance(topics, dict):
            # presumably a lone resource may arrive un-wrapped; treat it as
            # a one-element list instead of failing on the slice below
            topics = [topics]

        results = []
        for item in topics[:limit]:
            # An empty or missing section list must not abort the request.
            sections = (item.get("Sections") or {}).get("section") or [{}]
            first_section = sections[0] if isinstance(sections, list) else sections
            content = (first_section or {}).get("Content") or ""
            results.append({
                "title": item.get("Title", ""),
                "url": item.get("AccessibleVersion", item.get("MyHealthfinder", "")),
                "image_url": item.get("ImageAltUrl", ""),
                "image_alt": item.get("ImageAltText", ""),
                "categories": item.get("Categories", ""),
                "content": content[:500],
            })

        return {
            "success": True,
            "source": "health.gov",
            "topic": topic,
            "results": results,
            "count": len(results),
        }

    except Exception as e:
        log.error(f"Health topics fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "health.gov",
        }
+ + Args: + topic: Health topic to search (optional) + limit: Maximum results (default: 10) + + Returns: + Dictionary with health topics + """ + try: + url = f"{HEALTH_GOV_API}/myhealthfinder/api/v3/topicsearch.json" + params = {"lang": "en"} + + if topic: + params["topic"] = topic + else: + params["pageSize"] = limit + + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + topics = data.get("Result", {}).get("Resources", {}).get("Resource", []) + results = [] + + for item in topics[:limit]: + results.append({ + "title": item.get("Title", ""), + "url": item.get("AccessibleVersion", item.get("MyHealthfinder", "")), + "image_url": item.get("ImageAltUrl", ""), + "image_alt": item.get("ImageAltText", ""), + "categories": item.get("Categories", ""), + "content": item.get("Sections", {}).get("section", [{}])[0].get("Content", "")[:500] if item.get("Sections") else "", + }) + + return { + "success": True, + "source": "health.gov", + "topic": topic, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Health topics fetch failed: {e}") + return { + "success": False, + "error": str(e), + "source": "health.gov", + } + + +# Tool schemas for OpenAI function calling +MEDICAL_SEARCH_PUBMED_SCHEMA = { + "type": "function", + "function": { + "name": "medical_search_pubmed", + "description": "Search PubMed for medical and health research articles. 
# Function-calling schema for medical_get_pubmed_abstract.
MEDICAL_GET_PUBMED_ABSTRACT_SCHEMA = {
    "type": "function",
    "function": {
        "name": "medical_get_pubmed_abstract",
        "description": "Get the full abstract of a PubMed article. Use after medical_search_pubmed to get detailed content.",
        "parameters": {
            "type": "object",
            "properties": {
                "pmid": {
                    "type": "string",
                    "description": "PubMed ID from search results",
                },
            },
            "required": ["pmid"],
        },
    },
}

# Function-calling schema for medical_get_disease_data.
MEDICAL_GET_DISEASE_DATA_SCHEMA = {
    "type": "function",
    "function": {
        "name": "medical_get_disease_data",
        "description": "Get current disease statistics (COVID-19, influenza). Use for outbreak data and statistics.",
        "parameters": {
            "type": "object",
            "properties": {
                "disease": {
                    "type": "string",
                    "description": "Disease type (covid, influenza, all)",
                    "default": "covid",
                },
            },
            "required": [],
        },
    },
}

# Function-calling schema for medical_get_covid_country.  "country" is
# optional, so its default is declared here just as the sibling schemas do
# for their optional parameters; "usa" matches the function's own default.
MEDICAL_GET_COVID_COUNTRY_SCHEMA = {
    "type": "function",
    "function": {
        "name": "medical_get_covid_country",
        "description": "Get COVID-19 statistics for a specific country. Use for country-specific pandemic data.",
        "parameters": {
            "type": "object",
            "properties": {
                "country": {
                    "type": "string",
                    "description": "Country name or ISO code (e.g., usa, uk, germany)",
                    "default": "usa",
                },
            },
            "required": [],
        },
    },
}
Use for medication info.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query (drug name, ingredient, etc.)", + }, + "database": { + "type": "string", + "description": "Database to search (drug, device, food)", + "default": "drug", + "enum": ["drug", "device", "food"], + }, + "limit": { + "type": "integer", + "description": "Maximum results (default: 10)", + "default": 10, + }, + }, + "required": ["query"], + }, + }, +} + +MEDICAL_GET_HEALTH_TOPICS_SCHEMA = { + "type": "function", + "function": { + "name": "medical_get_health_topics", + "description": "Get health information and topics from Health.gov. Use for general health advice and wellness topics.", + "parameters": { + "type": "object", + "properties": { + "topic": { + "type": "string", + "description": "Health topic to search (optional)", + }, + "limit": { + "type": "integer", + "description": "Maximum results (default: 10)", + "default": 10, + }, + }, + "required": [], + }, + }, +} diff --git a/tools/news_tool.py b/tools/news_tool.py new file mode 100644 index 0000000..320da3b --- /dev/null +++ b/tools/news_tool.py @@ -0,0 +1,434 @@ +""" +News Tool - Fetch news from free sources + +Free sources used: +- GNews API (free tier: 100 requests/day) +- Currents API (free tier: 200 requests/day) +- Hacker News (completely free) +- Reddit (free JSON feeds) + +No API key required for Hacker News and Reddit. +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timedelta +from typing import Optional + +import requests + +log = logging.getLogger(__name__) + +# Free news APIs (no key required for some) +GNEWS_API = "https://gnews.io/api/v4" +CURRENTS_API = "https://api.currentsapi.services/v1" +HACKER_NEWS_API = "https://hacker-news.firebaseio.com/v0" +REDDIT_API = "https://www.reddit.com" + + +def news_search_hackernews( + query: str, + limit: int = 10, +) -> dict: + """ + Search Hacker News for stories. 
+ + Args: + query: Search query + limit: Maximum number of results (default: 10) + + Returns: + Dictionary with search results + """ + try: + # Use Hacker News Algolia API for search (free, no key) + search_url = "https://hn.algolia.com/api/v1/search" + params = { + "query": query, + "hitsPerPage": limit, + "tags": "story", + } + + response = requests.get(search_url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + results = [] + for hit in data.get("hits", []): + results.append({ + "title": hit.get("title", ""), + "url": hit.get("url", ""), + "points": hit.get("points", 0), + "author": hit.get("author", ""), + "created_at": hit.get("created_at", ""), + "comments": hit.get("num_comments", 0), + "hn_link": f"https://news.ycombinator.com/item?id={hit.get('objectID', '')}", + }) + + return { + "success": True, + "source": "hacker_news", + "query": query, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Hacker News search failed: {e}") + return { + "success": False, + "error": str(e), + "source": "hacker_news", + } + + +def news_get_top_stories( + limit: int = 15, +) -> dict: + """ + Get top stories from Hacker News. 
+ + Args: + limit: Maximum number of stories (default: 15) + + Returns: + Dictionary with top stories + """ + try: + # Get top story IDs + response = requests.get(f"{HACKER_NEWS_API}/topstories.json", timeout=10) + response.raise_for_status() + story_ids = response.json()[:limit] + + results = [] + for story_id in story_ids: + try: + story_response = requests.get( + f"{HACKER_NEWS_API}/item/{story_id}.json", + timeout=10 + ) + story = story_response.json() + + if story: + results.append({ + "title": story.get("title", ""), + "url": story.get("url", ""), + "points": story.get("score", 0), + "author": story.get("by", ""), + "time": datetime.fromtimestamp(story.get("time", 0)).isoformat(), + "comments": story.get("descendants", 0), + "hn_link": f"https://news.ycombinator.com/item?id={story_id}", + }) + except Exception: + continue + + return { + "success": True, + "source": "hacker_news", + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Hacker News top stories failed: {e}") + return { + "success": False, + "error": str(e), + "source": "hacker_news", + } + + +def news_get_reddit( + subreddit: str = "worldnews", + limit: int = 15, + timeframe: str = "day", +) -> dict: + """ + Get top posts from a Reddit subreddit. 
+ + Args: + subreddit: Subreddit name (default: worldnews) + limit: Maximum number of posts (default: 15) + timeframe: Time period (hour, day, week, month, year, all) + + Returns: + Dictionary with Reddit posts + """ + try: + # Reddit provides free JSON feeds + url = f"{REDDIT_API}/r/{subreddit}/top.json" + headers = {"User-Agent": "DocRAG/1.0"} + params = { + "limit": limit, + "t": timeframe, + } + + response = requests.get(url, headers=headers, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + results = [] + for child in data.get("data", {}).get("children", []): + post = child.get("data", {}) + results.append({ + "title": post.get("title", ""), + "url": post.get("url", ""), + "author": post.get("author", ""), + "score": post.get("score", 0), + "comments": post.get("num_comments", 0), + "subreddit": post.get("subreddit", ""), + "created": datetime.fromtimestamp(post.get("created_utc", 0)).isoformat(), + "permalink": f"https://reddit.com{post.get('permalink', '')}", + "selftext": post.get("selftext", "")[:500] if post.get("selftext") else "", + }) + + return { + "success": True, + "source": "reddit", + "subreddit": subreddit, + "timeframe": timeframe, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Reddit fetch failed: {e}") + return { + "success": False, + "error": str(e), + "source": "reddit", + } + + +def news_search_reddit( + query: str, + subreddit: str = "all", + limit: int = 15, +) -> dict: + """ + Search Reddit for posts matching a query. 
+ + Args: + query: Search query + subreddit: Subreddit to search (default: all) + limit: Maximum number of results (default: 15) + + Returns: + Dictionary with search results + """ + try: + url = f"{REDDIT_API}/r/{subreddit}/search.json" + headers = {"User-Agent": "DocRAG/1.0"} + params = { + "q": query, + "limit": limit, + "sort": "relevance", + "restrict_sr": "true" if subreddit != "all" else "false", + } + + response = requests.get(url, headers=headers, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + results = [] + for child in data.get("data", {}).get("children", []): + post = child.get("data", {}) + results.append({ + "title": post.get("title", ""), + "url": post.get("url", ""), + "author": post.get("author", ""), + "score": post.get("score", 0), + "comments": post.get("num_comments", 0), + "subreddit": post.get("subreddit", ""), + "created": datetime.fromtimestamp(post.get("created_utc", 0)).isoformat(), + "permalink": f"https://reddit.com{post.get('permalink', '')}", + "selftext": post.get("selftext", "")[:500] if post.get("selftext") else "", + }) + + return { + "success": True, + "source": "reddit", + "query": query, + "subreddit": subreddit, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Reddit search failed: {e}") + return { + "success": False, + "error": str(e), + "source": "reddit", + } + + +def news_aggregate( + query: str, + limit: int = 10, +) -> dict: + """ + Aggregate news from multiple free sources. 
+ + Args: + query: Search query + limit: Maximum results per source (default: 10) + + Returns: + Dictionary with aggregated news from multiple sources + """ + results = [] + errors = [] + + # Search Hacker News + hn_result = news_search_hackernews(query, limit) + if hn_result.get("success"): + results.extend([ + {**r, "source": "hacker_news"} for r in hn_result.get("results", []) + ]) + else: + errors.append(f"Hacker News: {hn_result.get('error')}") + + # Search Reddit + reddit_result = news_search_reddit(query, "all", limit) + if reddit_result.get("success"): + results.extend([ + {**r, "source": "reddit"} for r in reddit_result.get("results", []) + ]) + else: + errors.append(f"Reddit: {reddit_result.get('error')}") + + return { + "success": True, + "query": query, + "results": results, + "count": len(results), + "sources_checked": ["hacker_news", "reddit"], + "errors": errors if errors else None, + } + + +# Tool schemas for OpenAI function calling +NEWS_SEARCH_HACKERNEWS_SCHEMA = { + "type": "function", + "function": { + "name": "news_search_hackernews", + "description": "Search Hacker News for tech news and discussions. Best for technology, startups, programming topics.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "limit": { + "type": "integer", + "description": "Maximum number of results (default: 10)", + "default": 10, + }, + }, + "required": ["query"], + }, + }, +} + +NEWS_GET_TOP_STORIES_SCHEMA = { + "type": "function", + "function": { + "name": "news_get_top_stories", + "description": "Get current top stories from Hacker News. 
Use for general tech news browsing.", + "parameters": { + "type": "object", + "properties": { + "limit": { + "type": "integer", + "description": "Maximum number of stories (default: 15)", + "default": 15, + }, + }, + "required": [], + }, + }, +} + +NEWS_GET_REDDIT_SCHEMA = { + "type": "function", + "function": { + "name": "news_get_reddit", + "description": "Get top posts from a Reddit subreddit. Great for news, discussions, and community content.", + "parameters": { + "type": "object", + "properties": { + "subreddit": { + "type": "string", + "description": "Subreddit name (e.g., worldnews, technology, science)", + "default": "worldnews", + }, + "limit": { + "type": "integer", + "description": "Maximum number of posts (default: 15)", + "default": 15, + }, + "timeframe": { + "type": "string", + "description": "Time period (hour, day, week, month, year, all)", + "default": "day", + "enum": ["hour", "day", "week", "month", "year", "all"], + }, + }, + "required": [], + }, + }, +} + +NEWS_SEARCH_REDDIT_SCHEMA = { + "type": "function", + "function": { + "name": "news_search_reddit", + "description": "Search Reddit for posts matching a query across all subreddits.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "subreddit": { + "type": "string", + "description": "Subreddit to search (default: all)", + "default": "all", + }, + "limit": { + "type": "integer", + "description": "Maximum number of results (default: 15)", + "default": 15, + }, + }, + "required": ["query"], + }, + }, +} + +NEWS_AGGREGATE_SCHEMA = { + "type": "function", + "function": { + "name": "news_aggregate", + "description": "Search for news from multiple sources (Hacker News, Reddit) in one call. 
Best for comprehensive news coverage.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "limit": { + "type": "integer", + "description": "Maximum results per source (default: 10)", + "default": 10, + }, + }, + "required": ["query"], + }, + }, +} diff --git a/tools/science_tool.py b/tools/science_tool.py new file mode 100644 index 0000000..30b2a3a --- /dev/null +++ b/tools/science_tool.py @@ -0,0 +1,464 @@ +""" +Scientific/Academic Tool - Search scientific papers and research + +Free sources used: +- arXiv API (completely free, no key required) +- Semantic Scholar API (free tier) +- DOAJ (Directory of Open Access Journals - free) +- CORE API (free access to research papers) + +All APIs are free for basic use. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Optional +import xml.etree.ElementTree as ET + +import requests + +log = logging.getLogger(__name__) + +# Free academic APIs +ARXIV_API = "http://export.arxiv.org/api/query" +SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1" +DOAJ_API = "https://api.doaj.org" + + +def science_search_arxiv( + query: str, + max_results: int = 10, + category: Optional[str] = None, +) -> dict: + """ + Search arXiv for scientific preprints. 
+ + Args: + query: Search query + max_results: Maximum number of results (default: 10) + category: arXiv category filter (e.g., cs.AI, physics, math.CO) + + Returns: + Dictionary with arXiv search results + """ + try: + # Build search query + search_query = query + if category: + search_query = f"cat:{category} AND {query}" + + params = { + "search_query": search_query, + "start": 0, + "max_results": max_results, + "sortBy": "relevance", + "sortOrder": "descending", + } + + response = requests.get(ARXIV_API, params=params, timeout=30) + response.raise_for_status() + + # Parse XML response + root = ET.fromstring(response.content) + + # Define namespace + ns = {"atom": "http://www.w3.org/2005/Atom"} + + results = [] + for entry in root.findall("atom:entry", ns): + title = entry.find("atom:title", ns) + summary = entry.find("atom:summary", ns) + published = entry.find("atom:published", ns) + updated = entry.find("atom:updated", ns) + link = entry.find("atom:id", ns) + + authors = [] + for author in entry.findall("atom:author", ns): + name = author.find("atom:name", ns) + if name is not None: + authors.append(name.text) + + # Get categories + categories = [] + for cat in entry.findall("atom:category", ns): + term = cat.get("term") + if term: + categories.append(term) + + results.append({ + "title": title.text.strip() if title is not None else "", + "abstract": summary.text.strip()[:1000] if summary is not None else "", + "authors": authors, + "published": published.text if published is not None else "", + "updated": updated.text if updated is not None else "", + "link": link.text if link is not None else "", + "pdf_link": link.text.replace("/abs/", "/pdf/") if link is not None else "", + "categories": categories, + }) + + return { + "success": True, + "source": "arxiv", + "query": query, + "category": category, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"arXiv search failed: {e}") + return { + "success": False, + "error": 
str(e), + "source": "arxiv", + } + + +def science_search_semantic_scholar( + query: str, + limit: int = 10, + year: Optional[str] = None, +) -> dict: + """ + Search Semantic Scholar for academic papers. + + Args: + query: Search query + limit: Maximum number of results (default: 10) + year: Year filter (e.g., "2020-", "2018-2022") + + Returns: + Dictionary with Semantic Scholar results + """ + try: + url = f"{SEMANTIC_SCHOLAR_API}/paper/search" + params = { + "query": query, + "limit": limit, + "fields": "title,abstract,authors,year,venue,citationCount,openAccessPdf,url", + } + + if year: + params["year"] = year + + response = requests.get(url, params=params, timeout=15) + response.raise_for_status() + data = response.json() + + results = [] + for paper in data.get("data", []): + authors = [a.get("name", "") for a in paper.get("authors", [])] + + pdf_url = None + if paper.get("openAccessPdf"): + pdf_url = paper["openAccessPdf"].get("url") + + results.append({ + "paper_id": paper.get("paperId"), + "title": paper.get("title", ""), + "abstract": paper.get("abstract", "")[:1000] if paper.get("abstract") else "", + "authors": authors, + "year": paper.get("year"), + "venue": paper.get("venue", ""), + "citations": paper.get("citationCount", 0), + "url": paper.get("url"), + "pdf_url": pdf_url, + }) + + return { + "success": True, + "source": "semantic_scholar", + "query": query, + "year_filter": year, + "results": results, + "count": len(results), + "total": data.get("total", len(results)), + } + + except Exception as e: + log.error(f"Semantic Scholar search failed: {e}") + return { + "success": False, + "error": str(e), + "source": "semantic_scholar", + } + + +def science_get_paper_details( + paper_id: str, +) -> dict: + """ + Get detailed information about a paper from Semantic Scholar. 
+ + Args: + paper_id: Semantic Scholar paper ID or DOI + + Returns: + Dictionary with paper details + """ + try: + url = f"{SEMANTIC_SCHOLAR_API}/paper/{paper_id}" + params = { + "fields": "title,abstract,authors,year,venue,citationCount,referenceCount,openAccessPdf,url,journal,publicationVenue,tldr", + } + + response = requests.get(url, params=params, timeout=15) + response.raise_for_status() + paper = response.json() + + authors = [a.get("name", "") for a in paper.get("authors", [])] + + pdf_url = None + if paper.get("openAccessPdf"): + pdf_url = paper["openAccessPdf"].get("url") + + tldr = None + if paper.get("tldr"): + tldr = paper["tldr"].get("text") + + return { + "success": True, + "source": "semantic_scholar", + "paper_id": paper.get("paperId"), + "title": paper.get("title", ""), + "abstract": paper.get("abstract", ""), + "authors": authors, + "year": paper.get("year"), + "venue": paper.get("venue", ""), + "journal": paper.get("journal", {}).get("name") if paper.get("journal") else None, + "citations": paper.get("citationCount", 0), + "references": paper.get("referenceCount", 0), + "url": paper.get("url"), + "pdf_url": pdf_url, + "tldr": tldr, + } + + except Exception as e: + log.error(f"Paper details fetch failed: {e}") + return { + "success": False, + "error": str(e), + "source": "semantic_scholar", + } + + +def science_search_doaj( + query: str, + limit: int = 10, +) -> dict: + """ + Search DOAJ (Directory of Open Access Journals). 
+ + Args: + query: Search query + limit: Maximum number of results (default: 10) + + Returns: + Dictionary with DOAJ results + """ + try: + url = f"{DOAJ_API}/search/articles/{query}" + params = { + "pageSize": limit, + "page": 1, + } + + headers = {"Accept": "application/json"} + + response = requests.get(url, params=params, headers=headers, timeout=15) + response.raise_for_status() + data = response.json() + + results = [] + for article in data.get("results", []): + bibjson = article.get("bibjson", {}) + results.append({ + "title": bibjson.get("title", ""), + "abstract": bibjson.get("abstract", "")[:1000] if bibjson.get("abstract") else "", + "authors": [a.get("name", "") for a in bibjson.get("author", [])], + "year": bibjson.get("year"), + "journal": bibjson.get("journal", {}).get("title", ""), + "doi": bibjson.get("identifier", [{}])[0].get("id") if bibjson.get("identifier") else None, + "link": bibjson.get("link", [{}])[0].get("url") if bibjson.get("link") else None, + "keywords": bibjson.get("keywords", []), + }) + + return { + "success": True, + "source": "doaj", + "query": query, + "results": results, + "count": len(results), + "total": data.get("total", len(results)), + } + + except Exception as e: + log.error(f"DOAJ search failed: {e}") + return { + "success": False, + "error": str(e), + "source": "doaj", + } + + +def science_aggregate_search( + query: str, + limit: int = 5, +) -> dict: + """ + Search multiple academic sources at once. 
+ + Args: + query: Search query + limit: Maximum results per source (default: 5) + + Returns: + Dictionary with aggregated results from multiple sources + """ + results = [] + errors = [] + + # Search arXiv + arxiv_result = science_search_arxiv(query, limit) + if arxiv_result.get("success"): + results.extend([{**r, "source": "arxiv"} for r in arxiv_result.get("results", [])]) + else: + errors.append(f"arXiv: {arxiv_result.get('error')}") + + # Search Semantic Scholar + ss_result = science_search_semantic_scholar(query, limit) + if ss_result.get("success"): + results.extend([{**r, "source": "semantic_scholar"} for r in ss_result.get("results", [])]) + else: + errors.append(f"Semantic Scholar: {ss_result.get('error')}") + + return { + "success": True, + "query": query, + "results": results, + "count": len(results), + "sources_checked": ["arxiv", "semantic_scholar"], + "errors": errors if errors else None, + } + + +# Tool schemas for OpenAI function calling +SCIENCE_SEARCH_ARXIV_SCHEMA = { + "type": "function", + "function": { + "name": "science_search_arxiv", + "description": "Search arXiv for scientific preprints. Best for physics, math, computer science, and AI research.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "max_results": { + "type": "integer", + "description": "Maximum number of results (default: 10)", + "default": 10, + }, + "category": { + "type": "string", + "description": "arXiv category filter (e.g., cs.AI, cs.LG, physics, math.CO)", + }, + }, + "required": ["query"], + }, + }, +} + +SCIENCE_SEARCH_SEMANTIC_SCHOLAR_SCHEMA = { + "type": "function", + "function": { + "name": "science_search_semantic_scholar", + "description": "Search Semantic Scholar for academic papers across all fields. 
Includes citation counts and open access PDFs.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "limit": { + "type": "integer", + "description": "Maximum number of results (default: 10)", + "default": 10, + }, + "year": { + "type": "string", + "description": "Year filter (e.g., '2020-', '2018-2022')", + }, + }, + "required": ["query"], + }, + }, +} + +SCIENCE_GET_PAPER_DETAILS_SCHEMA = { + "type": "function", + "function": { + "name": "science_get_paper_details", + "description": "Get detailed information about a specific paper including TLDR summary. Use paper ID from search results.", + "parameters": { + "type": "object", + "properties": { + "paper_id": { + "type": "string", + "description": "Semantic Scholar paper ID or DOI", + }, + }, + "required": ["paper_id"], + }, + }, +} + +SCIENCE_SEARCH_DOAJ_SCHEMA = { + "type": "function", + "function": { + "name": "science_search_doaj", + "description": "Search DOAJ for open access journal articles. 
Best for peer-reviewed open access research.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "limit": { + "type": "integer", + "description": "Maximum number of results (default: 10)", + "default": 10, + }, + }, + "required": ["query"], + }, + }, +} + +SCIENCE_AGGREGATE_SEARCH_SCHEMA = { + "type": "function", + "function": { + "name": "science_aggregate_search", + "description": "Search multiple academic sources (arXiv, Semantic Scholar) at once for comprehensive coverage.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query", + }, + "limit": { + "type": "integer", + "description": "Maximum results per source (default: 5)", + "default": 5, + }, + }, + "required": ["query"], + }, + }, +} diff --git a/tools/weather_tool.py b/tools/weather_tool.py new file mode 100644 index 0000000..cdbcb91 --- /dev/null +++ b/tools/weather_tool.py @@ -0,0 +1,420 @@ +""" +Weather Tool - Get weather data and forecasts + +Free sources used: +- Open-Meteo API (completely free, no API key required) +- OpenWeatherMap (free tier available) + +Primary use: Open-Meteo (no key required) +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Optional + +import requests + +log = logging.getLogger(__name__) + +# Free weather APIs +OPEN_METEO_API = "https://api.open-meteo.com/v1" +GEOCODING_API = "https://geocoding-api.open-meteo.com/v1" + + +def weather_get_coordinates( + location: str, +) -> dict: + """ + Get coordinates for a location name. 
+ + Args: + location: City name or location (e.g., "New York", "London, UK") + + Returns: + Dictionary with location coordinates + """ + try: + url = f"{GEOCODING_API}/search" + params = { + "name": location, + "count": 1, + "language": "en", + "format": "json", + } + + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + results = data.get("results", []) + if not results: + return { + "success": False, + "error": f"Location not found: {location}", + "source": "open-meteo", + } + + loc = results[0] + return { + "success": True, + "source": "open-meteo", + "name": loc.get("name", ""), + "country": loc.get("country", ""), + "latitude": loc.get("latitude"), + "longitude": loc.get("longitude"), + "elevation": loc.get("elevation"), + "timezone": loc.get("timezone"), + "population": loc.get("population"), + } + + except Exception as e: + log.error(f"Geocoding failed: {e}") + return { + "success": False, + "error": str(e), + "source": "open-meteo", + } + + +def weather_get_current( + location: str, + units: str = "celsius", +) -> dict: + """ + Get current weather for a location. 
+ + Args: + location: City name or location + units: Temperature units (celsius or fahrenheit) + + Returns: + Dictionary with current weather data + """ + try: + # First get coordinates + geo = weather_get_coordinates(location) + if not geo.get("success"): + return geo + + lat = geo["latitude"] + lon = geo["longitude"] + + url = f"{OPEN_METEO_API}/forecast" + params = { + "latitude": lat, + "longitude": lon, + "current": "temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,rain,showers,snowfall,weather_code,cloud_cover,pressure_msl,surface_pressure,wind_speed_10m,wind_direction_10m,wind_gusts_10m", + "temperature_unit": units, + "timezone": "auto", + } + + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + current = data.get("current", {}) + + # Weather code descriptions + weather_codes = { + 0: "Clear sky", + 1: "Mainly clear", 2: "Partly cloudy", 3: "Overcast", + 45: "Fog", 48: "Depositing rime fog", + 51: "Light drizzle", 53: "Moderate drizzle", 55: "Dense drizzle", + 56: "Light freezing drizzle", 57: "Dense freezing drizzle", + 61: "Slight rain", 63: "Moderate rain", 65: "Heavy rain", + 66: "Light freezing rain", 67: "Heavy freezing rain", + 71: "Slight snow", 73: "Moderate snow", 75: "Heavy snow", + 77: "Snow grains", + 80: "Slight rain showers", 81: "Moderate rain showers", 82: "Violent rain showers", + 85: "Slight snow showers", 86: "Heavy snow showers", + 95: "Thunderstorm", 96: "Thunderstorm with slight hail", 99: "Thunderstorm with heavy hail", + } + + weather_code = current.get("weather_code", 0) + weather_description = weather_codes.get(weather_code, "Unknown") + + return { + "success": True, + "source": "open-meteo", + "location": geo.get("name", location), + "country": geo.get("country", ""), + "latitude": lat, + "longitude": lon, + "timezone": data.get("timezone", ""), + "temperature": current.get("temperature_2m"), + "feels_like": current.get("apparent_temperature"), 
+ "humidity": current.get("relative_humidity_2m"), + "weather_code": weather_code, + "weather_description": weather_description, + "cloud_cover": current.get("cloud_cover"), + "pressure_msl": current.get("pressure_msl"), + "wind_speed": current.get("wind_speed_10m"), + "wind_direction": current.get("wind_direction_10m"), + "wind_gusts": current.get("wind_gusts_10m"), + "precipitation": current.get("precipitation"), + "rain": current.get("rain"), + "snowfall": current.get("snowfall"), + "units": units, + "timestamp": datetime.now().isoformat(), + } + + except Exception as e: + log.error(f"Weather fetch failed: {e}") + return { + "success": False, + "error": str(e), + "source": "open-meteo", + } + + +def weather_get_forecast( + location: str, + days: int = 7, + units: str = "celsius", +) -> dict: + """ + Get weather forecast for a location. + + Args: + location: City name or location + days: Number of forecast days (1-16) + units: Temperature units (celsius or fahrenheit) + + Returns: + Dictionary with weather forecast + """ + try: + # First get coordinates + geo = weather_get_coordinates(location) + if not geo.get("success"): + return geo + + lat = geo["latitude"] + lon = geo["longitude"] + + url = f"{OPEN_METEO_API}/forecast" + params = { + "latitude": lat, + "longitude": lon, + "daily": "weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,sunrise,sunset,uv_index_max,precipitation_sum,rain_sum,showers_sum,snowfall_sum,precipitation_probability_max,wind_speed_10m_max,wind_gusts_10m_max", + "temperature_unit": units, + "timezone": "auto", + "forecast_days": min(days, 16), + } + + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + daily = data.get("daily", {}) + + # Weather code descriptions + weather_codes = { + 0: "Clear sky", + 1: "Mainly clear", 2: "Partly cloudy", 3: "Overcast", + 45: "Fog", 48: "Depositing rime fog", + 51: "Light drizzle", 53: 
"Moderate drizzle", 55: "Dense drizzle", + 56: "Light freezing drizzle", 57: "Dense freezing drizzle", + 61: "Slight rain", 63: "Moderate rain", 65: "Heavy rain", + 66: "Light freezing rain", 67: "Heavy freezing rain", + 71: "Slight snow", 73: "Moderate snow", 75: "Heavy snow", + 77: "Snow grains", + 80: "Slight rain showers", 81: "Moderate rain showers", 82: "Violent rain showers", + 85: "Slight snow showers", 86: "Heavy snow showers", + 95: "Thunderstorm", 96: "Thunderstorm with slight hail", 99: "Thunderstorm with heavy hail", + } + + forecasts = [] + dates = daily.get("time", []) + for i, date in enumerate(dates): + weather_code = daily.get("weather_code", [])[i] if i < len(daily.get("weather_code", [])) else 0 + forecasts.append({ + "date": date, + "temp_max": daily.get("temperature_2m_max", [])[i] if i < len(daily.get("temperature_2m_max", [])) else None, + "temp_min": daily.get("temperature_2m_min", [])[i] if i < len(daily.get("temperature_2m_min", [])) else None, + "feels_like_max": daily.get("apparent_temperature_max", [])[i] if i < len(daily.get("apparent_temperature_max", [])) else None, + "feels_like_min": daily.get("apparent_temperature_min", [])[i] if i < len(daily.get("apparent_temperature_min", [])) else None, + "weather_code": weather_code, + "weather_description": weather_codes.get(weather_code, "Unknown"), + "precipitation": daily.get("precipitation_sum", [])[i] if i < len(daily.get("precipitation_sum", [])) else None, + "rain": daily.get("rain_sum", [])[i] if i < len(daily.get("rain_sum", [])) else None, + "snowfall": daily.get("snowfall_sum", [])[i] if i < len(daily.get("snowfall_sum", [])) else None, + "precipitation_probability": daily.get("precipitation_probability_max", [])[i] if i < len(daily.get("precipitation_probability_max", [])) else None, + "uv_index": daily.get("uv_index_max", [])[i] if i < len(daily.get("uv_index_max", [])) else None, + "wind_speed_max": daily.get("wind_speed_10m_max", [])[i] if i < 
# Inclusive upper bound of each US AQI band, in ascending order.
# Any value above the last bound is reported as "Hazardous".
_US_AQI_BANDS = (
    (50, "Good"),
    (100, "Moderate"),
    (150, "Unhealthy for Sensitive Groups"),
    (200, "Unhealthy"),
    (300, "Very Unhealthy"),
)


def _us_aqi_category(aqi) -> str:
    """Map a US AQI reading to its category label ("Unknown" when there is no reading)."""
    if aqi is None:
        return "Unknown"
    for upper_bound, label in _US_AQI_BANDS:
        if aqi <= upper_bound:
            return label
    return "Hazardous"


def weather_get_air_quality(
    location: str,
) -> dict:
    """
    Get air quality index for a location.

    The location is first resolved through ``weather_get_coordinates``; if
    that lookup reports failure, its result dict is returned unchanged.

    Args:
        location: City name or location

    Returns:
        Dictionary with air quality data. On success it carries ``us_aqi``,
        ``aqi_category`` and the individual pollutant readings. ``us_aqi`` is
        ``None`` and ``aqi_category`` is ``"Unknown"`` when the service
        returns no AQI value. On any exception the dictionary has
        ``success`` set to False and an ``error`` message.
    """
    try:
        # First get coordinates
        geo = weather_get_coordinates(location)
        if not geo.get("success"):
            return geo

        params = {
            "latitude": geo["latitude"],
            "longitude": geo["longitude"],
            "current": "us_aqi,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,ammonia",
            "timezone": "auto",
        }

        response = requests.get(
            "https://air-quality-api.open-meteo.com/v1/air-quality",
            params=params,
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()

        # "current" may be absent or null; treat both as "no readings".
        current = data.get("current") or {}

        # Do not default a missing reading to 0: that would be reported as a
        # genuine "Good" measurement, and a null value would break the
        # numeric comparison.
        aqi = current.get("us_aqi")

        return {
            "success": True,
            "source": "open-meteo",
            "location": geo.get("name", location),
            "country": geo.get("country", ""),
            "us_aqi": aqi,
            "aqi_category": _us_aqi_category(aqi),
            "pm2_5": current.get("pm2_5"),
            "pm10": current.get("pm10"),
            "carbon_monoxide": current.get("carbon_monoxide"),
            "nitrogen_dioxide": current.get("nitrogen_dioxide"),
            "sulphur_dioxide": current.get("sulphur_dioxide"),
            "ozone": current.get("ozone"),
            "ammonia": current.get("ammonia"),
            "timestamp": datetime.now().isoformat(),
        }

    except Exception as e:
        log.error(f"Air quality fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "open-meteo",
        }
Includes temperature, precipitation, UV index, and more.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City name or location", + }, + "days": { + "type": "integer", + "description": "Number of forecast days (1-16)", + "default": 7, + }, + "units": { + "type": "string", + "description": "Temperature units", + "default": "celsius", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, + }, +} + +WEATHER_GET_AIR_QUALITY_SCHEMA = { + "type": "function", + "function": { + "name": "weather_get_air_quality", + "description": "Get air quality index and pollutant levels for a location. Includes PM2.5, PM10, ozone, and more.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City name or location", + }, + }, + "required": ["location"], + }, + }, +} diff --git a/tools/web_tool.py b/tools/web_tool.py new file mode 100644 index 0000000..1c5fb00 --- /dev/null +++ b/tools/web_tool.py @@ -0,0 +1,427 @@ +""" +Web Search Tool - General web search capabilities + +Free sources used: +- DuckDuckGo Instant Answer API (completely free) +- DuckDuckGo HTML search (free, no API key) +- Wikipedia API (as fallback) + +All completely free, no API keys required. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Optional +from urllib.parse import quote_plus, unquote_plus + +import requests + +log = logging.getLogger(__name__) + +# Free search endpoints +DUCKDUCKGO_API = "https://api.duckduckgo.com" +DUCKDUCKGO_HTML = "https://html.duckduckgo.com/html" + + +def web_search( + query: str, + max_results: int = 10, +) -> dict: + """ + Search the web using DuckDuckGo. 
+ + Args: + query: Search query + max_results: Maximum number of results (default: 10) + + Returns: + Dictionary with search results + """ + try: + # Use DuckDuckGo HTML search (free, no API key) + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + params = {"q": query} + + response = requests.get( + DUCKDUCKGO_HTML, + params=params, + headers=headers, + timeout=15 + ) + response.raise_for_status() + + # Parse HTML results + results = _parse_ddg_html(response.text, max_results) + + return { + "success": True, + "source": "duckduckgo", + "query": query, + "results": results, + "count": len(results), + } + + except Exception as e: + log.error(f"Web search failed: {e}") + return { + "success": False, + "error": str(e), + "source": "duckduckgo", + } + + +def _parse_ddg_html(html: str, max_results: int) -> list: + """Parse DuckDuckGo HTML results.""" + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html, "html.parser") + results = [] + + # Find result links + for result in soup.select(".result")[:max_results]: + try: + link_elem = result.select_one(".result__a") + snippet_elem = result.select_one(".result__snippet") + + if link_elem: + url = link_elem.get("href", "") + + # Extract actual URL from redirect + if "uddg=" in url: + url = url.split("uddg=")[-1].split("&")[0] + url = unquote_plus(url) + + results.append({ + "title": link_elem.get_text(strip=True), + "url": url, + "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "", + }) + except Exception: + continue + + return results + + +def web_instant_answer( + query: str, +) -> dict: + """ + Get instant answer from DuckDuckGo. 
def web_get_page_content(
    url: str,
    max_length: int = 5000,
) -> dict:
    """
    Fetch and extract text content from a web page.

    Script, style, nav, header and footer elements are removed before the
    text is extracted, and blank lines are dropped.

    Args:
        url: URL to fetch
        max_length: Maximum content length (default: 5000 chars). The
            returned content never exceeds this, including the trailing
            "..." marker added on truncation. Values <= 0 yield empty content.

    Returns:
        Dictionary with page content. ``truncated`` tells whether the text
        was cut to fit ``max_length``. On any exception the dictionary has
        ``success`` set to False and an ``error`` message.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse and extract text
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(response.text, "html.parser")

        # Remove elements that carry no article text
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        title = soup.title.get_text(strip=True) if soup.title else ""

        # Collapse the page to its non-empty, stripped lines
        raw_text = soup.get_text(separator="\n", strip=True)
        text = "\n".join(
            stripped for stripped in (line.strip() for line in raw_text.splitlines()) if stripped
        )

        # A negative limit must not act as a "drop the last n chars" slice.
        limit = max(max_length, 0)
        truncated = len(text) > limit
        if truncated:
            # Keep the marker inside the budget; omit it when the budget is
            # too small to hold any text next to it.
            marker = "..." if limit > 3 else ""
            text = text[: limit - len(marker)] + marker

        return {
            "success": True,
            "source": "web",
            "url": url,
            "title": title,
            "content": text,
            "content_length": len(text),
            "truncated": truncated,
        }

    except Exception as e:
        log.error(f"Page content fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "web",
            "url": url,
        }
def web_get_headers(
    url: str,
) -> dict:
    """
    Issue a HEAD request for a URL and report the response metadata.

    Args:
        url: Address to probe; redirects are followed

    Returns:
        On success, a dictionary with the status code, the response headers
        as a plain dict, and the final URL after redirects. On any
        exception, a dictionary with ``success`` set to False and the error
        text.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        reply = requests.head(
            url,
            headers=browser_headers,
            timeout=10,
            allow_redirects=True,
        )

        report = {"success": True, "source": "web", "url": url}
        report["status_code"] = reply.status_code
        report["headers"] = dict(reply.headers)
        report["final_url"] = reply.url
        return report

    except Exception as e:
        log.error(f"Header fetch failed: {e}")
        failure = {"success": False, "error": str(e)}
        failure["source"] = "web"
        failure["url"] = url
        return failure
# Function-calling schema for web_instant_answer: a single required query string.
WEB_INSTANT_ANSWER_SCHEMA = dict(
    type="function",
    function=dict(
        name="web_instant_answer",
        description="Get instant answer from DuckDuckGo for facts, definitions, and summaries. Good for quick facts.",
        parameters=dict(
            type="object",
            properties=dict(
                query=dict(type="string", description="Query for instant answer"),
            ),
            required=["query"],
        ),
    ),
)

# Function-calling schema for web_get_page_content: required URL, optional length cap.
WEB_GET_PAGE_CONTENT_SCHEMA = dict(
    type="function",
    function=dict(
        name="web_get_page_content",
        description="Fetch and extract text content from a web page URL. Use after web_search to get full content.",
        parameters=dict(
            type="object",
            properties=dict(
                url=dict(type="string", description="URL to fetch"),
                max_length=dict(
                    type="integer",
                    description="Maximum content length in characters (default: 5000)",
                    default=5000,
                ),
            ),
            required=["url"],
        ),
    ),
)
def _clean_search_snippet(snippet: str) -> str:
    """Remove HTML tags from a search snippet and decode its HTML entities."""
    import re
    from html import unescape

    return unescape(re.sub(r"<[^>]+>", "", snippet))


def wikipedia_search(
    query: str,
    limit: int = 5,
) -> dict:
    """
    Search Wikipedia for articles matching the query.

    Args:
        query: Search query
        limit: Maximum number of results (default: 5); clamped to 1-500
            before being sent as ``srlimit``

    Returns:
        Dictionary with search results. Each result has ``title``,
        ``pageid``, a plain-text ``snippet``, ``wordcount`` and ``url``.
        On an API-level error or any exception the dictionary has
        ``success`` set to False and an ``error`` message.
    """
    try:
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srlimit": max(1, min(limit, 500)),
            "format": "json",
            "utf8": 1,
        }

        response = requests.get(WIKIPEDIA_API, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # The API can report failures inside a normal HTTP 200 response;
        # without this check they would look like "zero results".
        api_error = data.get("error")
        if api_error:
            message = api_error.get("info", str(api_error)) if isinstance(api_error, dict) else str(api_error)
            log.error(f"Wikipedia search failed: {message}")
            return {
                "success": False,
                "error": message,
                "source": "wikipedia",
            }

        results = []
        for item in data.get("query", {}).get("search", []):
            page_id = item.get("pageid", 0)
            results.append({
                "title": item.get("title", ""),
                "pageid": page_id,
                # Snippets arrive with highlight markup and HTML entities;
                # return plain text.
                "snippet": _clean_search_snippet(item.get("snippet", "")),
                "wordcount": item.get("wordcount", 0),
                "url": f"https://en.wikipedia.org/?curid={page_id}",
            })

        return {
            "success": True,
            "source": "wikipedia",
            "query": query,
            "results": results,
            "count": len(results),
        }

    except Exception as e:
        log.error(f"Wikipedia search failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "wikipedia",
        }
def wikipedia_get_full_article(
    title: str,
) -> dict:
    """
    Get the full content of a Wikipedia article.

    Redirects are followed, and the text is requested as plain text rather
    than HTML.

    Args:
        title: Article title

    Returns:
        Dictionary with full article content (``title``, ``pageid``,
        ``content``, ``url``) for the first existing page in the response.
        If no page exists, the API reports an error, or any exception
        occurs, the dictionary has ``success`` set to False and an
        ``error`` message.
    """
    try:
        params = {
            "action": "query",
            "prop": "extracts",
            "explaintext": True,
            "titles": title,
            "format": "json",
            "utf8": 1,
            "redirects": 1,
        }

        response = requests.get(WIKIPEDIA_API, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()

        # The API can report failures inside a normal HTTP 200 response;
        # surface them instead of misreporting "Article not found".
        api_error = data.get("error")
        if api_error:
            message = api_error.get("info", str(api_error)) if isinstance(api_error, dict) else str(api_error)
            log.error(f"Wikipedia full article fetch failed: {message}")
            return {
                "success": False,
                "error": message,
                "source": "wikipedia",
            }

        pages = data.get("query", {}).get("pages", {})

        for page_id, page_data in pages.items():
            # Skip missing or invalid titles: they are keyed by negative
            # placeholder ids and flagged with "missing" / "invalid".
            if (
                str(page_id).startswith("-")
                or "missing" in page_data
                or "invalid" in page_data
            ):
                continue
            return {
                "success": True,
                "source": "wikipedia",
                "title": page_data.get("title", ""),
                "pageid": page_id,
                "content": page_data.get("extract", ""),
                "url": f"https://en.wikipedia.org/?curid={page_id}",
            }

        return {
            "success": False,
            "error": f"Article not found: {title}",
            "source": "wikipedia",
        }

    except Exception as e:
        log.error(f"Wikipedia full article fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "wikipedia",
        }
# Function-calling schema for wikipedia_get_full_article: one required article title.
WIKIPEDIA_GET_FULL_ARTICLE_SCHEMA = dict(
    type="function",
    function=dict(
        name="wikipedia_get_full_article",
        description="Get the full content of a Wikipedia article. Use for comprehensive research when the summary is not enough.",
        parameters=dict(
            type="object",
            properties=dict(
                title=dict(type="string", description="The exact article title"),
            ),
            required=["title"],
        ),
    ),
)