# projects/pyrag3/search_manager.py

import logging
import requests
import os
import re
from urllib.parse import unquote, quote
import wikipedia
import feedparser
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

class SearchManager:
    """Collect candidate source URLs for a query from several backends.

    Backends are queried one after another (not concurrently): DuckDuckGo
    web search, Wikipedia, and a fixed list of RSS news feeds. Every backend
    is best-effort: a failure is logged and yields no links rather than
    raising to the caller.
    """

    # Whole-word triggers that mark a query as time-sensitive. Plural forms are
    # listed explicitly because matching is done on whole words, not substrings.
    # NOTE(review): "2026" is a hard-coded year and will go stale.
    _TEMPORAL_KEYWORDS = frozenset({
        "today", "latest", "now", "2026", "news",
        "headline", "headlines", "update", "updates",
    })

    def __init__(self):
        # The session and headers are created here but are not used by any
        # method of this class; they are kept for external callers.
        self.session = requests.Session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
        # Curated list of high-authority, public RSS feeds
        self.news_feeds = [
            "https://www.reutersagency.com/en/reuters-best/rss/",
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://rss.cnn.com/rss/cnn_topstories.rss",
            "https://www.aljazeera.com/xml/rss/all.xml",
            "https://feeds.npr.org/1001/rss.xml",
        ]

    def search(self, query, num_results=5):
        """Return up to ``num_results`` unique URLs relevant to ``query``.

        Sources are consulted sequentially and concatenated in priority order:
        web results first, then at most two Wikipedia pages, then (only for
        time-sensitive queries) at most two curated news links. Because the
        combined list is truncated to ``num_results``, lower-priority sources
        only appear when the web phase returns fewer links than requested.

        Args:
            query: Free-text search string.
            num_results: Maximum number of URLs to return. ``None`` disables
                the final truncation.

        Returns:
            A list of URL strings without duplicates, in discovery order.
            Empty when ``query`` is blank or ``num_results`` is not positive.
        """
        # Guard first: a negative limit would otherwise turn the final slice
        # into "all but the last k", and a blank query cannot match anything.
        if not query or not query.strip():
            return []
        if num_results is not None and num_results <= 0:
            return []

        logger.info("Unified Discovery for: %s", query)

        results = []

        # 1. Web phase (primary): DuckDuckGo text search.
        web_links = self._search_web(query, num_results=num_results)
        if web_links:
            logger.info("Web Discovery found %d specific articles.", len(web_links))
            results.extend(web_links)

        # 2. Encyclopedic phase (secondary): capped at 2 so reference pages
        #    do not crowd out the web findings.
        wiki_links = self._search_wikipedia(query, num_results=2)
        if wiki_links:
            logger.info("Wikipedia Knowledge found %d pages.", len(wiki_links))
            results.extend(wiki_links)

        # 3. Curated news feeds, only when the query looks time-sensitive.
        if self._is_time_sensitive(query):
            results.extend(self._search_curated_news(query, num_results=2))

        return self._unique_in_order(results)[:num_results]

    @staticmethod
    def _is_time_sensitive(query):
        """Return True when ``query`` contains a temporal keyword as a whole word."""
        # Tokenise so that e.g. "now" does not fire on "know" or "snow".
        tokens = set(re.findall(r"\w+", query.lower()))
        return not tokens.isdisjoint(SearchManager._TEMPORAL_KEYWORDS)

    @staticmethod
    def _unique_in_order(links):
        """Return ``links`` without duplicates or empty values, keeping first-seen order."""
        return list(dict.fromkeys(link for link in links if link))

    def _search_wikipedia(self, query, num_results):
        """Return URLs of up to ``num_results`` Wikipedia pages matching ``query``.

        Titles that cannot be resolved to a page are skipped. Any failure of
        the title search itself yields an empty list.
        """
        try:
            titles = wikipedia.search(query, results=num_results)
        except Exception as e:
            logger.debug("Wikipedia search failed for %r: %s", query, e)
            return []

        urls = []
        for title in titles:
            try:
                # auto_suggest=False: resolve the exact title returned by search.
                urls.append(wikipedia.page(title, auto_suggest=False).url)
            except Exception as e:
                logger.debug("Skipping Wikipedia page %r: %s", title, e)
        return urls

    def _search_curated_news(self, query, num_results):
        """Return up to ``num_results`` links from the curated RSS feeds.

        An entry matches when any whitespace-separated word of ``query``
        occurs (case-insensitively, as a substring) in the entry title.
        Feeds are read in list order and scanning stops as soon as enough
        links have been collected. A feed that fails is skipped.
        """
        words = query.lower().split()
        if not words or num_results <= 0:
            return []

        links = []
        for feed_url in self.news_feeds:
            try:
                feed = feedparser.parse(feed_url)
                for entry in feed.entries:
                    # Entries are not guaranteed to carry a title or link;
                    # tolerate that instead of abandoning the whole feed.
                    title = (entry.get("title") or "").lower()
                    link = entry.get("link")
                    if link and any(word in title for word in words):
                        links.append(link)
                        if len(links) >= num_results:
                            return links
            except Exception as e:
                logger.debug("Skipping news feed %s: %s", feed_url, e)
        return links

    def _search_web(self, query, num_results):
        """Return result URLs from a DuckDuckGo text search for ``query``.

        Any failure (including the optional dependency being missing) is
        logged as an error and yields an empty list.
        """
        try:
            # Imported lazily so the module still loads without this package.
            from duckduckgo_search import DDGS

            hits = DDGS().text(query, max_results=num_results)
            # A hit without an 'href' is skipped rather than failing the batch.
            return [hit["href"] for hit in hits if hit.get("href")]
        except Exception as e:
            logger.error("Web Search Error: %s", e)
            return []

if __name__ == "__main__":
    # Manual smoke test: run one sample query with INFO logging enabled
    # and print whatever links come back.
    logging.basicConfig(level=logging.INFO)
    manager = SearchManager()
    found = manager.search('Who was John Bidwell', num_results=2)
    print(f"Parallel Search: {found}")