110 lines
4.1 KiB
Python
110 lines
4.1 KiB
Python
import logging
|
|
import requests
|
|
import os
|
|
import re
|
|
from urllib.parse import unquote, quote
|
|
import wikipedia
|
|
import feedparser
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Module-level logger, named after this module via __name__; used by SearchManager below.
logger = logging.getLogger(__name__)
|
|
|
|
class SearchManager:
    """Gather candidate source URLs for a free-text query.

    Three providers are consulted in priority order: DuckDuckGo web search,
    Wikipedia, and (for time-sensitive queries only) a fixed list of RSS
    news feeds. Every provider is best-effort: a failure is logged and
    treated as "no results", so one broken source never aborts a search.
    """

    # Words that mark a query as time-sensitive and enable the RSS phase.
    # NOTE(review): the hard-coded year "2026" will go stale.
    _TEMPORAL_KEYWORDS = ("today", "latest", "now", "2026", "news", "headline", "update")

    # Whole-word match (with an optional plural / past-tense suffix) so that
    # "now" does not fire on "know" or "snow" the way a substring test does.
    _TEMPORAL_RE = re.compile(
        r"\b(?:" + "|".join(map(re.escape, _TEMPORAL_KEYWORDS)) + r")(?:s|d)?\b",
        re.IGNORECASE,
    )

    def __init__(self):
        """Create the HTTP session, default headers and the RSS feed list."""
        # NOTE(review): neither `session` nor `headers` is read by the methods
        # of this class; they are kept because external code may use them.
        self.session = requests.Session()
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
        }
        # Curated list of high-authority, public RSS feeds
        self.news_feeds = [
            "https://www.reutersagency.com/en/reuters-best/rss/",
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://rss.cnn.com/rss/cnn_topstories.rss",
            "https://www.aljazeera.com/xml/rss/all.xml",
            "https://feeds.npr.org/1001/rss.xml",
        ]

    @staticmethod
    def _dedupe(links):
        """Return ``links`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(links))

    @classmethod
    def _is_time_sensitive(cls, query):
        """Return True when ``query`` contains a temporal keyword as a whole word."""
        return bool(cls._TEMPORAL_RE.search(query))

    def search(self, query, num_results=5):
        """Return up to ``num_results`` unique URLs for ``query``.

        Web results come first, then Wikipedia pages (at most 2), then RSS
        news links (at most 2, only for time-sensitive queries). The combined
        list is de-duplicated in that order and cut to ``num_results``.

        Args:
            query: Free-text search string.
            num_results: Maximum number of URLs to return; a value <= 0
                yields an empty list.

        Returns:
            A list of URL strings, possibly empty.
        """
        # A negative cap would otherwise make the final slice drop items from
        # the end instead of returning nothing.
        if num_results <= 0:
            return []

        logger.info("Unified Discovery for: %s", query)

        # 1. Web phase (primary): specific, long-tail articles.
        web_links = self._search_web(query, num_results=num_results)
        if web_links:
            logger.info("Web Discovery found %d specific articles.", len(web_links))
        results = self._dedupe(web_links)

        # Later phases only append after the web links, so once the cap is
        # reached they would be truncated away; skip their network calls.
        # NOTE(review): if reference pages should always appear, the cap
        # logic itself needs to reserve slots for them.
        if len(results) >= num_results:
            return results[:num_results]

        # 2. Encyclopedic phase (secondary): at most 2 pages of broad context.
        wiki_links = self._search_wikipedia(query, num_results=2)
        if wiki_links:
            logger.info("Wikipedia Knowledge found %d pages.", len(wiki_links))
        results = self._dedupe(results + wiki_links)

        # 3. Curated news feeds: optional backup for time-sensitive queries.
        if len(results) < num_results and self._is_time_sensitive(query):
            news_links = self._search_curated_news(query, num_results=2)
            results = self._dedupe(results + news_links)

        return results[:num_results]

    def _search_wikipedia(self, query, num_results):
        """Return URLs of up to ``num_results`` Wikipedia pages matching ``query``.

        Titles that cannot be resolved to a page are skipped; if the search
        call itself fails an empty list is returned.
        """
        try:
            titles = wikipedia.search(query, results=num_results)
        except Exception as exc:  # best-effort provider boundary
            logger.warning("Wikipedia search failed for %r: %s", query, exc)
            return []

        urls = []
        for title in titles:
            try:
                urls.append(wikipedia.page(title, auto_suggest=False).url)
            except Exception as exc:  # one bad title must not lose the others
                logger.debug("Skipping Wikipedia page %r: %s", title, exc)
        return urls

    def _search_curated_news(self, query, num_results):
        """Return up to ``num_results`` RSS entry links whose title matches ``query``.

        An entry matches when any whitespace-separated word of the lower-cased
        query occurs in its lower-cased title. Feeds are read in the order of
        ``self.news_feeds`` and reading stops once enough links are collected.
        """
        words = query.lower().split()
        # Nothing can match (or nothing is wanted): avoid fetching any feed.
        if num_results <= 0 or not words:
            return []

        links = []
        for feed_url in self.news_feeds:
            try:
                entries = feedparser.parse(feed_url).entries
            except Exception as exc:  # a broken feed must not stop the others
                logger.warning("Could not read feed %s: %s", feed_url, exc)
                continue

            for entry in entries:
                # Entries may lack a title or link; skip just that entry
                # rather than abandoning the rest of the feed.
                title = (entry.get("title") or "").lower()
                link = entry.get("link")
                if link and any(word in title for word in words):
                    links.append(link)
                    if len(links) >= num_results:
                        return links
        return links

    def _search_web(self, query, num_results):
        """Return result URLs from DuckDuckGo text search for ``query``.

        Any failure (including the optional ``duckduckgo_search`` package
        being absent) is logged and produces an empty list.
        """
        try:
            # Imported lazily so the rest of the class works without the package.
            from duckduckgo_search import DDGS

            hits = DDGS().text(query, max_results=num_results) or []
            # A hit without an "href" is skipped instead of failing the batch.
            return [hit["href"] for hit in hits if hit.get("href")]
        except Exception as e:
            logger.error("Web Search Error: %s", e)
            return []
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run one live query with INFO logging and print the URLs.
    logging.basicConfig(level=logging.INFO)
    manager = SearchManager()
    found = manager.search("Who was John Bidwell", num_results=2)
    print(f"Parallel Search: {found}")
|