"""Unified link discovery across web search, Wikipedia and curated RSS feeds."""

import logging
import os
import re
from urllib.parse import unquote, quote

import feedparser
import requests
import wikipedia
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class SearchManager:
    """Collect result URLs for a query from several public sources.

    Sources are queried one after another: DuckDuckGo web search first,
    then Wikipedia, then (only for time-sensitive queries) a fixed list of
    news RSS feeds. Every source is best-effort; a failing source
    contributes no links instead of raising.
    """

    def __init__(self):
        # NOTE(review): the session and headers are not referenced by the
        # search methods in this file; kept as attributes for any external use.
        self.session = requests.Session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
        # Curated list of high-authority, public RSS feeds
        self.news_feeds = [
            "https://www.reutersagency.com/en/reuters-best/rss/",
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://rss.cnn.com/rss/cnn_topstories.rss",
            "https://www.aljazeera.com/xml/rss/all.xml",
            "https://feeds.npr.org/1001/rss.xml",
        ]

    def search(self, query, num_results=5):
        """Return up to ``num_results`` unique URLs relevant to ``query``.

        The sources are queried sequentially and their links concatenated in
        priority order: web results, then at most two Wikipedia pages, then
        at most two curated news links when the query looks time-sensitive.

        Args:
            query: Free-text search string.
            num_results: Maximum number of URLs to return; also the number of
                hits requested from the web search.

        Returns:
            A list of URL strings, de-duplicated with first occurrence kept,
            truncated to ``num_results``.
        """
        logger.info("Unified Discovery for: %s", query)
        results = []

        # 1. Web Phase (Primary): Use DuckDuckGo Search
        # This provides specific, long-tail articles directly without redirects
        web_links = self._search_web(query, num_results=num_results)
        if web_links:
            logger.info("Web Discovery found %s specific articles.", len(web_links))
            results.extend(web_links)

        # 2. Encyclopedic Phase (Secondary): Check Wikipedia for broad context
        # Limit to 2 results to avoid 'cluttering' the findings as per user request
        wiki_links = self._search_wikipedia(query, num_results=2)
        if wiki_links:
            logger.info("Wikipedia Knowledge found %s pages.", len(wiki_links))
            results.extend(wiki_links)

        # 3. Curated News feeds for high-authority updates (Optional backup)
        # NOTE(review): this is a substring test, so "now" also fires on
        # queries containing e.g. "know"; the year is hard-coded.
        temporal_keywords = ("today", "latest", "now", "2026", "news", "headline", "update")
        lowered = query.lower()
        if any(keyword in lowered for keyword in temporal_keywords):
            news_links = self._search_curated_news(query, num_results=2)
            if news_links:
                results.extend(news_links)

        # Deduplicate while preserving order (web results naturally come
        # first); dict keys keep insertion order.
        return list(dict.fromkeys(results))[:num_results]

    def _search_wikipedia(self, query, num_results):
        """Return URLs of Wikipedia pages matching ``query``.

        Args:
            query: Free-text search string.
            num_results: Maximum number of page titles to request.

        Returns:
            A list of page URLs. Titles whose page cannot be loaded are
            skipped; if the title search itself fails the list is empty.
        """
        try:
            titles = wikipedia.search(query, results=num_results)
        except Exception:
            # Best-effort source: a failed lookup must not break the search.
            logger.debug("Wikipedia search failed for %r", query, exc_info=True)
            return []

        urls = []
        for title in titles:
            try:
                # auto_suggest=False: load exactly the title the search returned.
                urls.append(wikipedia.page(title, auto_suggest=False).url)
            except Exception:
                logger.debug("Skipping Wikipedia page %r", title, exc_info=True)
        return urls

    def _search_curated_news(self, query, num_results):
        """Return links of curated-feed entries whose title mentions the query.

        An entry matches when any whitespace-separated word of the
        lower-cased query occurs as a substring of its lower-cased title.

        Args:
            query: Free-text search string.
            num_results: Maximum number of links to return.

        Returns:
            A list of at most ``num_results`` entry links, in feed order.
        """
        words = query.lower().split()
        if not words or num_results <= 0:
            # Nothing can match / nothing requested: skip the network entirely.
            return []

        links = []
        for feed_url in self.news_feeds:
            try:
                entries = feedparser.parse(feed_url).entries
            except Exception:
                logger.debug("Skipping feed %s", feed_url, exc_info=True)
                continue

            for entry in entries:
                # Entries without a title or link are skipped individually
                # instead of discarding the rest of the feed.
                title = (getattr(entry, "title", "") or "").lower()
                link = getattr(entry, "link", None)
                if link and any(word in title for word in words):
                    links.append(link)
                    if len(links) >= num_results:
                        return links
        return links

    def _search_web(self, query, num_results):
        """Return result URLs from a DuckDuckGo text search.

        Args:
            query: Free-text search string.
            num_results: Maximum number of hits to request.

        Returns:
            A list of URLs taken from each hit's ``'href'`` field; hits
            without one are skipped. Returns an empty list when the search
            backend is unavailable or fails.
        """
        try:
            # Imported lazily so the module loads without this optional package.
            from duckduckgo_search import DDGS

            hits = DDGS().text(query, max_results=num_results)
            return [hit["href"] for hit in (hits or []) if hit.get("href")]
        except Exception as e:
            logger.error("Web Search Error: %s", e)
            return []


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sm = SearchManager()
    print(f"Parallel Search: {sm.search('Who was John Bidwell', num_results=2)}")