projects/pyrag3/search_manager.py
2026-04-05 17:30:07 -07:00

110 lines
4.1 KiB
Python

import logging
import requests
import os
import re
from urllib.parse import unquote, quote
import wikipedia
import feedparser
from bs4 import BeautifulSoup
# Module-level logger, named after this module, shared by everything below.
logger = logging.getLogger(__name__)
class SearchManager:
    """Collect candidate source URLs for a query from several providers.

    Providers are queried one after another in priority order: DuckDuckGo
    web search, then Wikipedia, then (for time-sensitive queries only) a
    curated list of news RSS feeds. Every provider is best-effort: a failure
    is logged and treated as "no results" instead of being raised.
    """

    # Substrings that mark a query as time-sensitive and enable the RSS phase.
    # Matching is plain substring matching, so "now" also matches "snow".
    # NOTE(review): the year literal is hard-coded and will go stale.
    TEMPORAL_KEYWORDS = (
        "today", "latest", "now", "2026", "news", "headline", "update",
    )

    def __init__(self):
        """Create the HTTP session, default headers and the RSS feed list."""
        # NOTE(review): no method of this class reads `session` or `headers`;
        # presumably they are kept for external callers - confirm before
        # removing them.
        self.session = requests.Session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
        # Curated list of high-authority, public RSS feeds
        self.news_feeds = [
            "https://www.reutersagency.com/en/reuters-best/rss/",
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://rss.cnn.com/rss/cnn_topstories.rss",
            "https://www.aljazeera.com/xml/rss/all.xml",
            "https://feeds.npr.org/1001/rss.xml"
        ]

    @staticmethod
    def _dedupe(links):
        """Return *links* without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(links))

    def search(self, query, num_results=5):
        """Return up to *num_results* unique URLs relevant to *query*.

        Phases run sequentially and earlier phases take priority in the
        returned list:

        1. Web search (DuckDuckGo), asked for ``num_results`` links.
        2. Wikipedia, asked for at most 2 pages of broad context.
        3. Curated news feeds, asked for at most 2 links, only when the
           query contains one of ``TEMPORAL_KEYWORDS``.

        Because the final list is cut to ``num_results``, a later phase can
        only contribute when the earlier ones left room; such a phase is
        skipped instead of performing network requests whose results would
        be thrown away.

        Args:
            query: Free-text search string. ``None``, empty or
                whitespace-only queries yield an empty list.
            num_results: Maximum number of URLs to return. Values ``<= 0``
                yield an empty list; ``None`` disables the final truncation.

        Returns:
            A list of URL strings, de-duplicated, in priority order.
        """
        if not query or not query.strip():
            return []
        if num_results is not None and num_results <= 0:
            return []

        def is_full(links):
            # True once *links* alone would fill the final, truncated answer.
            return num_results is not None and len(links) >= num_results

        logger.info("Unified Discovery for: %s", query)

        # 1. Web Phase (Primary): specific, long-tail articles.
        web_links = self._search_web(query, num_results=num_results)
        if web_links:
            logger.info(
                "Web Discovery found %d specific articles.", len(web_links)
            )
        results = self._dedupe(web_links)
        if is_full(results):
            return results[:num_results]

        # 2. Encyclopedic Phase (Secondary): broad context, capped at 2 pages
        #    so reference material does not crowd out the web findings.
        wiki_links = self._search_wikipedia(query, num_results=2)
        if wiki_links:
            logger.info("Wikipedia Knowledge found %d pages.", len(wiki_links))
        results = self._dedupe(results + wiki_links)

        # 3. Curated news feeds: optional backup for time-sensitive queries.
        lowered = query.lower()
        if not is_full(results) and any(
            keyword in lowered for keyword in self.TEMPORAL_KEYWORDS
        ):
            news_links = self._search_curated_news(query, num_results=2)
            results = self._dedupe(results + news_links)

        return results[:num_results]

    def _search_wikipedia(self, query, num_results):
        """Return URLs of up to *num_results* Wikipedia pages for *query*.

        Uses the ``wikipedia`` library: first a title search, then one page
        lookup per title to obtain its URL. A title whose lookup raises is
        skipped; if the title search itself fails, an empty list is returned.
        """
        try:
            titles = wikipedia.search(query, results=num_results)
        except Exception as exc:  # best-effort provider: never propagate
            logger.warning("Wikipedia search failed for %r: %s", query, exc)
            return []

        urls = []
        for title in titles:
            try:
                # auto_suggest=False: look up the title exactly as the search
                # returned it instead of letting the library rewrite it.
                urls.append(wikipedia.page(title, auto_suggest=False).url)
            except Exception as exc:
                logger.debug("Skipping Wikipedia title %r: %s", title, exc)
        return urls

    def _search_curated_news(self, query, num_results):
        """Return up to *num_results* links from the curated RSS feeds.

        An entry matches when its lower-cased title contains any
        whitespace-separated word of the lower-cased *query* (substring
        match). Feeds are read in the order of ``self.news_feeds`` and
        reading stops as soon as enough links were collected. A feed that
        cannot be parsed is skipped; an entry without a title or link is
        ignored without discarding the rest of its feed.
        """
        words = query.lower().split()
        if not words or num_results <= 0:
            return []

        links = []
        for feed_url in self.news_feeds:
            try:
                entries = feedparser.parse(feed_url).entries
            except Exception as exc:  # best-effort provider: skip this feed
                logger.debug("Skipping news feed %s: %s", feed_url, exc)
                continue

            for entry in entries:
                title = (entry.get("title") or "").lower()
                link = entry.get("link")
                if not link or not any(word in title for word in words):
                    continue
                links.append(link)
                if len(links) >= num_results:
                    return links
        return links

    def _search_web(self, query, num_results):
        """Search Web for direct links avoiding Javascript redirects.

        Returns the ``href`` of each DuckDuckGo text result (results without
        one are ignored), or an empty list when the search fails or the
        ``duckduckgo_search`` package is not installed.
        """
        try:
            # Imported lazily so that a missing optional dependency only
            # disables this provider instead of breaking module import.
            from duckduckgo_search import DDGS

            hits = DDGS().text(query, max_results=num_results)
            return [hit["href"] for hit in hits if hit.get("href")]
        except Exception as e:
            logger.error("Web Search Error: %s", e)
            return []
if __name__ == "__main__":
    # Manual smoke test: run one sample query and print the URLs it yields.
    logging.basicConfig(level=logging.INFO)
    manager = SearchManager()
    found = manager.search('Who was John Bidwell', num_results=2)
    print(f"Parallel Search: {found}")