diff --git a/README.md b/README.md deleted file mode 100644 index 9ec7987..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# projects - diff --git a/__pycache__/crawler.cpython-312.pyc b/__pycache__/crawler.cpython-312.pyc new file mode 100644 index 0000000..49004a1 Binary files /dev/null and b/__pycache__/crawler.cpython-312.pyc differ diff --git a/__pycache__/gitea_api.cpython-312.pyc b/__pycache__/gitea_api.cpython-312.pyc new file mode 100644 index 0000000..e8f6f0b Binary files /dev/null and b/__pycache__/gitea_api.cpython-312.pyc differ diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000..00a3b4b Binary files /dev/null and b/__pycache__/main.cpython-312.pyc differ diff --git a/__pycache__/repo_manager.cpython-312.pyc b/__pycache__/repo_manager.cpython-312.pyc new file mode 100644 index 0000000..fe3596e Binary files /dev/null and b/__pycache__/repo_manager.cpython-312.pyc differ diff --git a/__pycache__/retrieval_service.cpython-312.pyc b/__pycache__/retrieval_service.cpython-312.pyc new file mode 100644 index 0000000..bd42503 Binary files /dev/null and b/__pycache__/retrieval_service.cpython-312.pyc differ diff --git a/__pycache__/search_manager.cpython-312.pyc b/__pycache__/search_manager.cpython-312.pyc new file mode 100644 index 0000000..4baf28a Binary files /dev/null and b/__pycache__/search_manager.cpython-312.pyc differ diff --git a/__pycache__/vector_service.cpython-312.pyc b/__pycache__/vector_service.cpython-312.pyc new file mode 100644 index 0000000..5d66180 Binary files /dev/null and b/__pycache__/vector_service.cpython-312.pyc differ diff --git a/.gitignore b/pyrag3/.gitignore similarity index 100% rename from .gitignore rename to pyrag3/.gitignore diff --git a/pyrag3/build_binary.py b/pyrag3/build_binary.py new file mode 100644 index 0000000..013bfc3 --- /dev/null +++ b/pyrag3/build_binary.py @@ -0,0 +1,33 @@ +import subprocess +import sys +import os + +def build(): + """Compile main.py into a standalone binary using Nuitka.""" + print("Starting Nuitka build process...") + + # Define flags + # --onefile: Create a single executable + # --standalone: Include all dependencies + # --follow-imports: Follow all imports + flags = [ + sys.executable, "-m", "nuitka", + "--onefile", + "--standalone", + "--follow-imports", + "--output-filename=doc_retrieve.exe" if sys.platform == "win32" else "--output-filename=doc_retrieve", + "main.py" + ] + + print(f"Running command: {' '.join(flags)}") + + try: + subprocess.run(flags, check=True) + print("Build successful!") + except subprocess.CalledProcessError as e: + print(f"Build failed with exit code {e.returncode}") + # Not exiting here to allow me to see the output in command_status + raise + +if __name__ == "__main__": + build() diff --git a/pyrag3/crawler.py b/pyrag3/crawler.py new file mode 100644 index 0000000..39e54dc --- /dev/null +++ b/pyrag3/crawler.py @@ -0,0 +1,111 @@ +import os +import logging +import queue +import threading +import time +import re +from pathlib import Path +from urllib.parse import urlparse, urljoin +import requests +from bs4 import BeautifulSoup + +# Reuse some constants and logic from the original downloader +ASSET_EXTENSIONS = ( + ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico", + ".woff", ".woff2", ".ttf", ".eot" +) + +logger = logging.getLogger(__name__) + +class Crawler: + def __init__(self, root_dir): + self.root_dir = Path(root_dir) + self.session = requests.Session() + self.session.headers.update({ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + }) + + def download_page(self, url, max_assets=20): + """Download a single page and its essential assets.""" + try: + resp = self.session.get(url, timeout=10) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + except Exception as e: + logger.error(f"Failed to fetch {url}: {e}") + return None + + def _sanitize_filename(self, name): + """Remove invalid characters for Windows/Linux filenames.""" + # Replace non-alphanumeric with underscores + res = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name) + # Limit length + return res[:100] + + def download_page(self, url, max_assets=20): + """Download a single page and its essential assets.""" + try: + resp = self.session.get(url, timeout=10) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + except Exception as e: + logger.error(f"Failed to fetch {url}: {e}") + return None + + # Determine local path + parsed = urlparse(url) + host_dir = self.root_dir / self._sanitize_filename(parsed.netloc) + host_dir.mkdir(parents=True, exist_ok=True) + + # Sanitize the filename from the path + raw_name = parsed.path.strip("/").replace("/", "_") or "index" + page_name = self._sanitize_filename(raw_name) + + if not page_name.lower().endswith(".html"): + page_name += ".html" + + local_path = host_dir / page_name + + # Simple asset downloading (minimal version of website_downloader.py) + assets_dir = host_dir / "assets" + assets_dir.mkdir(exist_ok=True) + + asset_count = 0 + for tag in soup.find_all(["img", "link", "script"]): + if asset_count >= max_assets: + break + + attr = "src" if tag.name in ["img", "script"] else "href" + link = tag.get(attr) + if not link or link.startswith("data:"): + continue + + asset_url = urljoin(url, link) + asset_parsed = urlparse(asset_url) + + if any(asset_parsed.path.lower().endswith(ext) for ext in ASSET_EXTENSIONS): + asset_name = os.path.basename(asset_parsed.path) + if not asset_name: continue + + asset_local_path = assets_dir / asset_name + try: + asset_resp = self.session.get(asset_url, timeout=5) + asset_resp.raise_for_status() + with open(asset_local_path, "wb") as f: + f.write(asset_resp.content) + # Rewrite link in soup + tag[attr] = f"assets/{asset_name}" + asset_count += 1 + except Exception as e: + logger.debug(f"Failed to download asset {asset_url}: {e}") + + with open(local_path, "w", encoding="utf-8") as f: + f.write(soup.prettify()) + + return local_path + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + c = Crawler("rag_data/repo/web") + path = c.download_page("https://example.com") + print(f"Downloaded to: {path}") diff --git a/pyrag3/gitea_api.py b/pyrag3/gitea_api.py new file mode 100644 index 0000000..1e9c42c --- /dev/null +++ b/pyrag3/gitea_api.py @@ -0,0 +1,138 @@ +import requests +import base64 +import os +import logging +from urllib.parse import quote + +logger = logging.getLogger(__name__) + +class GiteaAPI: + def __init__(self, owner="butterfly", repo="ragdocs"): + self.base_url = "https://git.client.guacamolebox.net/api/v1" + self.owner = owner + self.repo = repo + # Hardcoded for portability in standalone binary + self.token = "f8e1300d871e905e27ce54c3455a3343104d9b04" + self.headers = {"Authorization": f"token {self.token}", "Content-Type": "application/json"} + + def delete_file(self, filepath, sha, branch="main", message="Automated Cleanup"): + """Delete a file from the repository via the Gitea API.""" + url = f"{self.base_url}/repos/{self.owner}/{self.repo}/contents/{quote(filepath, safe='/')}" + data = { + "branch": branch, + "sha": sha, + "message": message + } + try: + resp = requests.delete(url, headers=self.headers, json=data, timeout=15) + if resp.status_code == 200: + logger.info(f"Deleted remote file: {filepath}") + return True + else: + logger.error(f"Failed to delete {filepath}: {resp.text}") + return False + except Exception as e: + logger.error(f"Error deleting {filepath}: {e}") + return False + + def list_files(self, path="", recursive=True): + """Recursively list all files in the repository.""" + url = f"{self.base_url}/repos/{self.owner}/{self.repo}/contents/{quote(path, safe='/')}" + try: + resp = requests.get(url, headers=self.headers, timeout=15) + if resp.status_code == 404: + return [] + resp.raise_for_status() + items = resp.json() + + # Gitea returns a list for directories, but a dict for single files + if isinstance(items, dict): + items = [items] + + files = [] + for item in items: + if item["type"] == "file": + files.append({ + "path": item["path"], + "download_url": item["download_url"], + "size": item["size"], + "sha": item["sha"] + }) + elif item["type"] == "dir" and recursive: + files.extend(self.list_files(item["path"], recursive=True)) + return files + except Exception as e: + logger.debug(f"Note: Failed to list files at {path}: {e}") + return [] + + def upload_file(self, local_path, remote_path, branch="main", message="Added new document"): + """Upload a file to the repository via the Gitea API (POST for Create, PUT for Update).""" + url = f"{self.base_url}/repos/{self.owner}/{self.repo}/contents/{quote(remote_path, safe='/')}" + + try: + with open(local_path, "rb") as f: + content = base64.b64encode(f.read()).decode("utf-8") + + # 1. Existing Check + sha = None + exists = False + try: + existing_resp = requests.get(url, headers=self.headers, timeout=10) + if existing_resp.status_code == 200: + sha = existing_resp.json().get("sha") + exists = True + elif existing_resp.status_code == 404: + exists = False + except: + exists = False # Assume new if check fails + + data = { + "branch": branch, + "content": content, + "message": message + } + if sha: + data["sha"] = sha + + # 2. Method selection (POST for Create, PUT for Update) + method = requests.put if exists else requests.post + resp = method(url, headers=self.headers, json=data, timeout=15) + + if resp.status_code not in [200, 201]: + logger.error(f"Failed to upload {remote_path}: {resp.text}") + return None + + return resp.json()["content"]["download_url"] + except Exception as e: + logger.error(f"Failed to upload {local_path} to {remote_path}: {e}") + return None + + def download_file(self, remote_path, local_path, is_binary=True): + """Download a file from the repository to a local path.""" + url = self.get_raw_url(remote_path) + try: + resp = requests.get(url, headers=self.headers, timeout=30) + resp.raise_for_status() + + mode = "wb" if is_binary else "w" + content = resp.content if is_binary else resp.text + + with open(local_path, mode) as f: + f.write(content) + logger.info(f"Downloaded {remote_path} to {local_path}") + return True + except Exception as e: + logger.error(f"Failed to download {remote_path}: {e}") + return False + + def get_raw_url(self, filepath, branch="main"): + """Generate the raw URL for a given file path.""" + return f"https://git.client.guacamolebox.net/{self.owner}/{self.repo}/raw/branch/{branch}/{quote(filepath, safe='/')}" + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + api = GiteaAPI() + files = api.list_files() + print(f"Found {len(files)} files.") + for f in files[:3]: + print(f" - {f['path']} at {f['download_url']}") diff --git a/pyrag3/main.py b/pyrag3/main.py new file mode 100644 index 0000000..0e4d2d1 --- /dev/null +++ b/pyrag3/main.py @@ -0,0 +1,175 @@ +import argparse +import json +import os +import logging +import shutil +from pathlib import Path +from pyrag3.repo_manager import RepoManager +from pyrag3.retrieval_service import RetrievalService +from pyrag3.search_manager import SearchManager +from pyrag3.crawler import Crawler + +# Setup logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +class DocumentRetrieveApp: + def __init__(self, data_root="rag_data"): + self.data_root = Path(data_root) + self.db_path = self.data_root / "index.db" + + # Initialize modules + self.repo = RepoManager() + self.retrieval = RetrievalService(db_path=self.db_path) + self.searcher = SearchManager() + # Crawler still uses a local temp dir for downloading before upload + self.crawler = Crawler(root_dir=self.data_root / "temp_web") + + def sync(self): + """Sync with remote shared index and then verify file metadata.""" + logger.info("Synchronizing shared index and remote metadata...") + try: + # 1. Pull the shared 'brain' (index.db) if it exists + self.retrieval.pull_index_db() + + # 2. Re-verify the file lists to catch any missing raw documents + self.retrieval.sync_and_reindex() + return True + except Exception as e: + logger.error(f"Sync failed: {e}") + return False + + def query(self, text, limit=5, fallback=True, similarity_threshold=0.35, perfect_threshold=0.70): + """Perform Hybrid chunk-level retrieval (Local Index + Web Discovery).""" + logger.info(f"Hybrid Query (Chunk Level): {text}") + + # 1. Initial Local Search for top chunks + results = self.retrieval.search(text, limit=limit) + best_score = results[0].get('score', 0) if results else 0 + + # 2. Proactive Discovery (If no good chunks found) + if fallback and best_score < perfect_threshold: + logger.info("Triggering search discovery for higher-relevance context...") + search_urls = self.searcher.search(text, num_results=5) + + new_files = [] + for url in search_urls: + # Deduplicate: check if URL is already represented in local chunks + if any(url == r['url'] for r in results): + continue + + logger.info(f"Archiving fresh context: {url}...") + local_path = self.crawler.download_page(url) + if local_path: + new_files.append(local_path) + + if new_files: + logger.info(f"Ingesting {len(new_files)} new discoveries...") + uploaded_urls = self.repo.commit_and_push(new_files, message=f"Discovery: {text}") + + if uploaded_urls: + for local_p in new_files: + remote_path = f"web_results/{local_p.name}" + d_url = next((u for u in uploaded_urls if local_p.name in u), None) + if d_url: + # Index as granular chunks immediately + self.retrieval.ingest_document(local_p, remote_path, d_url) + + # Cleanup and push updated shared index + try: shutil.rmtree(self.crawler.root_dir, ignore_errors=True) + except: pass + self.retrieval.push_index_db() + + # Final Search to get the new high-relevance chunks + results = self.retrieval.search(text, limit=limit) + + # Final filtering based on chunk similarity score + final_results = [r for r in results if r.get('score', 0) >= similarity_threshold] + + if final_results: + logger.info(f"Returning {len(final_results)} unified chunk results.") + return final_results + else: + logger.info("Returning 0 unified chunk results.") + return [] + +def main(): + parser = argparse.ArgumentParser(description="Storage-Efficient Document Retrieval Tool") + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Query command + query_parser = subparsers.add_parser("query", help="Query the document index") + query_parser.add_argument("text", help="The query text") + query_parser.add_argument("--limit", type=int, default=5, help="Number of results to return") + query_parser.add_argument("--no-fallback", action="store_true", help="Disable web search fallback") + query_parser.add_argument("--format", choices=["json", "text"], default="json", help="Output format") + + # Add command + add_parser = subparsers.add_parser("add", help="Add a local file or directory to the repo and index") + add_parser.add_argument("path", help="Path to local file or directory") + + # Update command + subparsers.add_parser("update", help="Sync remote inventory and re-index") + + # Reset command + subparsers.add_parser("reset", help="DANGEROUS: Purge all data from repo and index") + + args = parser.parse_args() + + app = DocumentRetrieveApp() + + if args.command == "update": + if app.sync(): + print("Successfully synced remote metadata and updated index.") + else: + print("Update failed.") + + elif args.command == "reset": + confirm = input("⚠️ CAUTION: This will permanently DELETE all files in Gitea and the local index. Are you sure? (y/n): ") + if confirm.lower() == 'y': + if app.retrieval.reset_all(): + print("Successfully purged all knowledge base assets.") + else: + print("Reset failed.") + else: + print("Reset cancelled.") + + elif args.command == "add": + path = Path(args.path) + if path.is_file(): + if app.retrieval.add_local_file(str(path)): + print(f"Successfully added {path.name} to the distributed knowledge base.") + else: + print(f"Failed to add {path.name}.") + elif path.is_dir(): + files = list(path.glob("*")) + print(f"Adding {len(files)} files from {path.name}...") + count = 0 + for f in files: + if f.is_file(): + if app.retrieval.add_local_file(str(f)): + count += 1 + print(f"Successfully added {count} files from {path.name}.") + else: + print(f"Path not found: {args.path}") + + elif args.command == "query": + results = app.query(args.text, limit=args.limit, fallback=(not args.no_fallback)) + + if args.format == "json": + # This now includes the full 'content' field for the LLM + print(json.dumps(results, indent=2)) + else: + if not results: + print("No results found.") + for r in results: + print(f"[{r['score']:.4f}] {r['title']}") + print(f"URL: {r['url']}") + content_preview = r.get('content', '')[:200].replace('\n', ' ') + print(f"Preview: {content_preview}...") + print("-" * 40) + else: + parser.print_help() + +if __name__ == "__main__": + main() diff --git a/pyrag3/rag_data/index.db b/pyrag3/rag_data/index.db new file mode 100644 index 0000000..65d85c4 Binary files /dev/null and b/pyrag3/rag_data/index.db differ diff --git a/pyrag3/repo_manager.py b/pyrag3/repo_manager.py new file mode 100644 index 0000000..23dc6b8 --- /dev/null +++ b/pyrag3/repo_manager.py @@ -0,0 +1,52 @@ +import logging +from pathlib import Path +from pyrag3.gitea_api import GiteaAPI + +# Hardcoded repository details as requested +REPO_URL = "https://f8e1300d871e905e27ce54c3455a3343104d9b04@git.client.guacamolebox.net/butterfly/ragdocs.git" + +logger = logging.getLogger(__name__) + +class RepoManager: + def __init__(self, repo_url=REPO_URL): + self.repo_url = repo_url + self.api = GiteaAPI() + + def sync_repo(self): + """Deprecated: We no longer clone or pull to save storage.""" + logger.info("Syncing repository metadata via API (skipping clone/pull)...") + # In the new architecture, sync is handled by the RetrievalService + # listing files via API. + pass + + def commit_and_push(self, file_paths, message="Added new documents"): + """Upload new files directly to the remote repository via API.""" + logger.info(f"Uploading {len(file_paths)} files to repository via API...") + uploaded_urls = [] + + for file_path in file_paths: + file_path = Path(file_path) + # Use relative path from the designated "web_pages" root or similar + # For simplicity, we'll use a standard remote folder + remote_path = f"web_results/{file_path.name}" + + url = self.api.upload_file(file_path, remote_path, message=message) + if url: + uploaded_urls.append(url) + logger.debug(f"Uploaded {file_path} to {url}") + else: + logger.error(f"Failed to upload {file_path}") + + return uploaded_urls + +if __name__ == "__main__": + # Quick test + logging.basicConfig(level=logging.INFO) + manager = RepoManager() + manager.sync_repo() + +if __name__ == "__main__": + # Quick test + logging.basicConfig(level=logging.INFO) + manager = RepoManager() + manager.sync_repo() diff --git a/pyrag3/requirements.txt b/pyrag3/requirements.txt new file mode 100644 index 0000000..8860af7 --- /dev/null +++ b/pyrag3/requirements.txt @@ -0,0 +1,11 @@ +requests +beautifulsoup4 +pypdf +googlesearch-python +zstandard +nuitka +duckduckgo-search +lxml +wikipedia +feedparser +markdownify diff --git a/pyrag3/retrieval_service.py b/pyrag3/retrieval_service.py new file mode 100644 index 0000000..eec81e3 --- /dev/null +++ b/pyrag3/retrieval_service.py @@ -0,0 +1,332 @@ +import sqlite3 +import os +import logging +import requests +import struct +import base64 +import mimetypes +import re +from pathlib import Path +from pypdf import PdfReader +from bs4 import BeautifulSoup +import markdownify as md +from pyrag3.gitea_api import GiteaAPI +from pyrag3.vector_service import VectorService + +logger = logging.getLogger(__name__) + +class RetrievalService: + def __init__(self, db_path="rag_data/index.db"): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self.api = GiteaAPI() + self.vector_service = VectorService() + self._init_db() + + def _init_db(self): + """Initialize SQLite database for metadata and vectors with chunk-level support.""" + if not os.path.exists(os.path.dirname(self.db_path)): + os.makedirs(os.path.dirname(self.db_path)) + + with sqlite3.connect(self.db_path) as conn: + # We migrate the schema for chunk support (using ID as primary key) + # Check if columns exist; if not, recreate or migrate + try: + cursor = conn.execute("SELECT content FROM documents LIMIT 1") + except sqlite3.OperationalError: + logger.info("Migrating database for chunk and content support...") + conn.execute("DROP TABLE IF EXISTS documents") + + conn.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT, + sha TEXT, + path TEXT, + title TEXT, + content TEXT, + embedding BLOB + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_url ON documents(url)") + conn.commit() + + def _serialize_vector(self, vector): + """Convert a list of floats to a binary BLOB.""" + if not vector: return None + return struct.pack(f"{len(vector)}f", *vector) + + def _deserialize_vector(self, blob): + """Convert a binary BLOB back to a list of floats.""" + if not blob: return None + count = len(blob) // 4 + return list(struct.unpack(f"{count}f", blob)) + + def _chunk_text(self, text, chunk_size=1500, overlap=200): + """Split text into overlapping chunks recursively for higher-precision search.""" + if not text: return [] + + chunks = [] + current_chunk = "" + + # Split by paragraphs + paragraphs = text.split("\n\n") + for para in paragraphs: + if len(current_chunk) + len(para) <= chunk_size: + current_chunk += para + "\n\n" + else: + if current_chunk: + chunks.append(current_chunk.strip()) + + # Handle oversized paragraphs by sentences + if len(para) > chunk_size: + sentences = para.split(". ") + sub_chunk = "" + for sent in sentences: + if len(sub_chunk) + len(sent) <= chunk_size: + sub_chunk += sent + ". " + else: + if sub_chunk: chunks.append(sub_chunk.strip()) + sub_chunk = sent + ". " + current_chunk = sub_chunk + else: + current_chunk = para + "\n\n" + + if current_chunk: + chunks.append(current_chunk.strip()) + + return chunks + + def _parse_content(self, raw_content, filepath): + """Extract rich markdown text and significant images for multimodal embedding.""" + ext = os.path.splitext(filepath)[1].lower() + mime_type, _ = mimetypes.guess_type(filepath) + + content = None + title = os.path.basename(filepath) + image_b64 = None + + try: + if mime_type and mime_type.startswith("image/"): + image_b64 = base64.b64encode(raw_content).decode("utf-8") + elif ext == ".pdf": + temp_pdf = Path("temp_index.pdf") + temp_pdf.write_bytes(raw_content) + reader = PdfReader(temp_pdf) + content = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()]) + temp_pdf.unlink() + elif ext in [".html", ".htm"]: + soup = BeautifulSoup(raw_content, "html.parser") + title = soup.title.string if soup.title else title + + # 1. Clean HTML for Text Extraction + for script in soup(["script", "style", "nav", "footer", "header"]): + script.decompose() + + # 2. Extract first significant image for multimodal context + for img in soup.find_all("img"): + src = img.get("src") + if src: + if any(x in src.lower() for x in ["icon", "logo", "pixel", "tracker", "sprite"]): + continue + if src.startswith("data:image"): + image_b64 = src.split(",")[1] + break + + # 3. Convert to clean Markdown + content = md.markdownify(str(soup), heading_style="ATX") + else: + content = raw_content.decode("utf-8", errors="ignore") + except Exception as e: + logger.error(f"Error parsing {filepath}: {e}") + + if content: + content = re.sub(r' +', ' ', content).strip() + + return content, title, image_b64, mime_type + + def ingest_document(self, local_path, remote_path, download_url, sha=None): + """Parse, CHUNK, vector, and index a document into granular pieces.""" + local_path = Path(local_path) + try: + raw_bytes = local_path.read_bytes() + content, title, image_b64, mime = self._parse_content(raw_bytes, str(local_path)) + + chunks = self._chunk_text(content) if content else [] + if not chunks and image_b64: + chunks = ["Visual Content"] + + indexed_count = 0 + with sqlite3.connect(self.db_path) as conn: + for i, chunk_text in enumerate(chunks): + # We can combine chunk text with the image for multimodal context if it's the first chunk + current_img = image_b64 if i == 0 else None + + embedding = self.vector_service.get_embedding(text=chunk_text, image_b64=current_img, mime_type=mime) + if embedding: + blob = self._serialize_vector(embedding) + conn.execute(""" + INSERT INTO documents (url, sha, path, title, content, embedding) + VALUES (?, ?, ?, ?, ?, ?) + """, (download_url, sha, remote_path, title, chunk_text, blob)) + indexed_count += 1 + conn.commit() + + if indexed_count > 0: + logger.debug(f"Granularly indexed {local_path.name} into {indexed_count} chunks.") + return True + return False + except Exception as e: + logger.error(f"Failed to ingest {local_path.name}: {e}") + return False + + def ingest_document_from_bytes(self, raw_bytes, remote_path, download_url, sha=None): + """Helper for sync_and_reindex to ingest without a local file.""" + try: + content, title, image_b64, mime = self._parse_content(raw_bytes, remote_path) + chunks = self._chunk_text(content) if content else [] + if not chunks and image_b64: chunks = ["Visual Content"] + + indexed_count = 0 + with sqlite3.connect(self.db_path) as conn: + for i, chunk_text in enumerate(chunks): + current_img = image_b64 if i == 0 else None + embedding = self.vector_service.get_embedding(text=chunk_text, image_b64=current_img, mime_type=mime) + if embedding: + blob = self._serialize_vector(embedding) + conn.execute(""" + INSERT INTO documents (url, sha, path, title, content, embedding) + VALUES (?, ?, ?, ?, ?, ?) + """, (download_url, sha, remote_path, title, chunk_text, blob)) + indexed_count += 1 + conn.commit() + return indexed_count > 0 + except Exception as e: + logger.error(f"Failed to sync-ingest {remote_path}: {e}") + return False + + def add_local_file(self, local_path): + """Upload a local file and then ingest it into the index.""" + path = Path(local_path) + if not path.exists(): + logger.error(f"File not found: {local_path}") + return False + + logger.info(f"Adding local file: {path.name}...") + + # 1. Upload to Gitea + remote_path = f"local_uploads/{path.name}" + download_url = self.api.upload_file(str(path), remote_path, message=f"Local Upload: {path.name}") + if not download_url: + return False + + # 2. Get SHA + files = self.api.list_files("local_uploads/") + sha = next((f["sha"] for f in files if f["path"] == remote_path), None) + + # 3. Ingest Locally + success = self.ingest_document(path, remote_path, download_url, sha) + + # 4. Push Updated Index + if success: + self.push_index_db() + logger.info(f"Successfully added and indexed {path.name}") + + return success + + def sync_and_reindex(self): + """Sync with remote Gitea repo and update semantic index with chunking.""" + logger.info("Starting Semantic Sync...") + + remote_files = self.api.list_files() + + with sqlite3.connect(self.db_path) as conn: + # We use DISTINCT because a URL now has multiple chunks + cursor = conn.execute("SELECT DISTINCT url, sha FROM documents") + indexed = {row[0]: row[1] for row in cursor.fetchall()} + + remote_urls = set() + for f in remote_files: + if f["path"] == "index.db": continue + url = f["download_url"] + remote_urls.add(url) + sha = f["sha"] + + if url not in indexed or indexed[url] != sha: + logger.info(f"Vecting new document: {f['path']}...") + try: + resp = requests.get(url, timeout=15) + resp.raise_for_status() + self.ingest_document_from_bytes(resp.content, f["path"], url, sha) + except Exception as e: + logger.error(f"Failed to index {url}: {e}") + + # Cleanup deleted files + to_remove = set(indexed.keys()) - remote_urls + if to_remove: + logger.info(f"Removing {len(to_remove)} deleted files from local index.") + for url in to_remove: + conn.execute("DELETE FROM documents WHERE url = ?", (url,)) + conn.commit() + logger.info("Semantic Indexing complete.") + + def pull_index_db(self, remote_path="index.db"): + """Download remote shared index from Gitea.""" + logger.info(f"Pulling shared index from {remote_path}...") + self.db_path.parent.mkdir(parents=True, exist_ok=True) + return self.api.download_file(remote_path, str(self.db_path), is_binary=True) + + def push_index_db(self, remote_path="index.db"): + """Upload local index to Gitea for other users.""" + logger.info(f"Pushing shared index to {remote_path}...") + return self.api.upload_file(str(self.db_path), remote_path, message="Updated Shared Index") + + def reset_all(self): + """DANGEROUS: Purge all documents and index from both repo and local system.""" + logger.warning("INHERENTLY DESTRUCTIVE: Purging knowledge base...") + + remote_files = self.api.list_files() + for f in remote_files: + logger.info(f"Deleting remote asset: {f['path']}...") + self.api.delete_file(f["path"], f["sha"], message="Reset: System Purge") + + if self.db_path.exists(): + try: + import gc + gc.collect() + self.db_path.unlink() + logger.info("Local index deleted.") + except Exception as e: + logger.error(f"Failed to delete local DB file: {e}") + self._init_db() + return True + + def search(self, query, limit=10): + """Search the semantic index for the best matching chunks.""" + query_vector = self.vector_service.get_embedding(query) + if not query_vector: return [] + + results = [] + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute("SELECT url, title, content, embedding FROM documents") + for url, title, content, blob in cursor.fetchall(): + doc_vector = self._deserialize_vector(blob) + similarity = self.vector_service.cosine_similarity(query_vector, doc_vector) + + results.append({ + "url": url, + "title": title, + "content": content, + "score": similarity + }) + + results.sort(key=lambda x: x["score"], reverse=True) + return results[:limit] + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + service = RetrievalService() + # Test + res = service.search("Chico history") + for r in res: + print(f"[{r['score']:.4f}] {r['title']} - {r['content'][:100]}...") diff --git a/pyrag3/search_manager.py b/pyrag3/search_manager.py new file mode 100644 index 0000000..8b0da4c --- /dev/null +++ b/pyrag3/search_manager.py @@ -0,0 +1,109 @@ +import logging +import requests +import os +import re +from urllib.parse import unquote, quote +import wikipedia +import feedparser +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + +class SearchManager: + def __init__(self): + self.session = requests.Session() + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + } + # Curated list of high-authority, public RSS feeds + self.news_feeds = [ + "https://www.reutersagency.com/en/reuters-best/rss/", + "http://feeds.bbci.co.uk/news/rss.xml", + "http://rss.cnn.com/rss/cnn_topstories.rss", + "https://www.aljazeera.com/xml/rss/all.xml", + "https://feeds.npr.org/1001/rss.xml" + ] + + def search(self, query, num_results=5): + """Unified parallel search: Web/News (Primary) + Wikipedia (Reference).""" + logger.info(f"Unified Discovery for: {query}") + + results = [] + + # 1. Web Phase (Primary): Use DuckDuckGo Search + # This provides specific, long-tail articles directly without redirects + web_links = self._search_web(query, num_results=num_results) + if web_links: + logger.info(f"Web Discovery found {len(web_links)} specific articles.") + results.extend(web_links) + + # 2. Encyclopedic Phase (Secondary): Check Wikipedia for broad context + # Limit to 2 results to avoid 'cluttering' the findings as per user request + wiki_links = self._search_wikipedia(query, num_results=2) + if wiki_links: + logger.info(f"Wikipedia Knowledge found {len(wiki_links)} pages.") + results.extend(wiki_links) + + # 3. Curated News feeds for high-authority updates (Optional backup) + temporal_keywords = ["today", "latest", "now", "2026", "news", "headline", "update"] + if any(k in query.lower() for k in temporal_keywords): + news_links = self._search_curated_news(query, num_results=2) + if news_links: + results.extend(news_links) + + # Deduplicate while preserving order (Web results now naturally come first) + unique_results = [] + seen = set() + for r in results: + if r not in seen: + unique_results.append(r) + seen.add(r) + + return unique_results[:num_results] + + def _search_wikipedia(self, query, num_results): + """Use the Wikipedia library for clean knowledge extraction.""" + try: + pages = wikipedia.search(query, results=num_results) + urls = [] + for page in pages: + try: + p = wikipedia.page(page, auto_suggest=False) + urls.append(p.url) + except: continue + return urls + except: + return [] + + def _search_curated_news(self, query, num_results): + """Search curated high-authority RSS feeds.""" + links = [] + words = query.lower().split() + for feed_url in self.news_feeds: + try: + feed = feedparser.parse(feed_url) + for entry in feed.entries: + if any(word in entry.title.lower() for word in words): + links.append(entry.link) + if len(links) >= num_results: break + except: continue + if len(links) >= num_results: break + return links + + def _search_web(self, query, num_results): + """Search Web for direct links avoiding Javascript redirects.""" + try: + from duckduckgo_search import DDGS + links = [] + results = DDGS().text(query, max_results=num_results) + for r in results: + links.append(r['href']) + return links + except Exception as e: + logger.error(f"Web Search Error: {e}") + return [] + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + sm = SearchManager() + print(f"Parallel Search: {sm.search('Who was John Bidwell', num_results=2)}") diff --git a/pyrag3/test_search.py b/pyrag3/test_search.py new file mode 100644 index 0000000..d0c9a98 --- /dev/null +++ b/pyrag3/test_search.py @@ -0,0 +1,7 @@ +import logging +from pyrag3.search_manager import SearchManager + +logging.basicConfig(level=logging.DEBUG) +sm = SearchManager() +res = sm.search("test") +print(f"Results: {res}") diff --git a/pyrag3/test_search_v2.py b/pyrag3/test_search_v2.py new file mode 100644 index 0000000..6b6f00d --- /dev/null +++ b/pyrag3/test_search_v2.py @@ -0,0 +1,7 @@ +import logging +from pyrag3.search_manager import SearchManager + +logging.basicConfig(level=logging.INFO) +sm = SearchManager() +res = sm.search("Starship news 2024") +print(f"Links found: {res}") diff --git a/pyrag3/tmp.json b/pyrag3/tmp.json new file mode 100644 index 0000000..1107c04 --- /dev/null +++ b/pyrag3/tmp.json @@ -0,0 +1,32 @@ +[ + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/wiki_Lake_Oroville.html", + "title": "\r\n Lake Oroville - Wikipedia\r\n ", + "content": "| Lake Oroville | |\n| --- | --- |\n| Satellite image of the lake by [Sentinel-2](/wiki/Sentinel-2 \"Sentinel-2\") | |\n| [Location of Lake Oroville in California, USA.](/wiki/File:Relief_map_of_California.png \"Location of Lake Oroville in California, USA.\") Location of Lake Oroville in California, USA. Lake Oroville Show map of California [Location of Lake Oroville in California, USA.](/wiki/File:Usa_edcp_relief_location_map.png \"Location of Lake Oroville in California, USA.\") Location of Lake Oroville in California, USA.", + "score": 0.6367981864470856 + }, + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/wiki_Lake_Oroville.html", + "title": "\r\n Lake Oroville - Wikipedia\r\n ", + "content": "| Lake Oroville | |\n| --- | --- |\n| Satellite image of the lake by [Sentinel-2](/wiki/Sentinel-2 \"Sentinel-2\") | |\n| [Location of Lake Oroville in California, USA.](/wiki/File:Relief_map_of_California.png \"Location of Lake Oroville in California, USA.\") Location of Lake Oroville in California, USA. Lake Oroville Show map of California [Location of Lake Oroville in California, USA.](/wiki/File:Usa_edcp_relief_location_map.png \"Location of Lake Oroville in California, USA.\") Location of Lake Oroville in California, USA.", + "score": 0.6367981864470856 + }, + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/wiki_Lake_Oroville.html", + "title": "\r\n Lake Oroville - Wikipedia\r\n ", + "content": "| |. **Lake Oroville**\n[[\n1\n]](#cite_note-GNIS-1)\nis a\n[reservoir](/wiki/Reservoir \"Reservoir\")\nformed by the\n[Oroville Dam](/wiki/Oroville_Dam \"Oroville Dam\")\nimpounding the\n[Feather River](/wiki/Feather_River \"Feather River\")\n, located in\n[Butte County](/wiki/Butte_County,_California \"Butte County, California\")\n, northern\n[California](/wiki/California \"California\")\n. The lake is situated 5 miles (8\u00a0km) northeast of the city of\n[Oroville](/wiki/Oroville,_California \"Oroville, California\")\n, within the\n[Lake Oroville State Recreation Area](/wiki/Lake_Oroville_State_Recreation_Area \"Lake Oroville State Recreation Area\")\n, in the western foothills of the\n[Sierra Nevada](/wiki/Sierra_Nevada \"Sierra Nevada\")\n. Known as the\n[second-largest reservoir in California](/wiki/List_of_largest_reservoirs_of_California \"List of largest reservoirs of California\")\n, Lake Oroville is treated as a keystone facility within the\n[California State Water Project](/wiki/California_State_Water_Project \"California State Water Project\")\nby storing water, providing\n[flood control](/wiki/Flood_control \"Flood control\")\n, recreation, freshwater releases to assist in controlling the salinity intrusion into the Sacramento-San Joaquin Delta and protecting fish and wildlife.", + "score": 0.5920514170838611 + }, + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/wiki_Lake_Oroville.html", + "title": "\r\n Lake Oroville - Wikipedia\r\n ", + "content": "| |. **Lake Oroville**\n[[\n1\n]](#cite_note-GNIS-1)\nis a\n[reservoir](/wiki/Reservoir \"Reservoir\")\nformed by the\n[Oroville Dam](/wiki/Oroville_Dam \"Oroville Dam\")\nimpounding the\n[Feather River](/wiki/Feather_River \"Feather River\")\n, located in\n[Butte County](/wiki/Butte_County,_California \"Butte County, California\")\n, northern\n[California](/wiki/California \"California\")\n. The lake is situated 5 miles (8\u00a0km) northeast of the city of\n[Oroville](/wiki/Oroville,_California \"Oroville, California\")\n, within the\n[Lake Oroville State Recreation Area](/wiki/Lake_Oroville_State_Recreation_Area \"Lake Oroville State Recreation Area\")\n, in the western foothills of the\n[Sierra Nevada](/wiki/Sierra_Nevada \"Sierra Nevada\")\n. Known as the\n[second-largest reservoir in California](/wiki/List_of_largest_reservoirs_of_California \"List of largest reservoirs of California\")\n, Lake Oroville is treated as a keystone facility within the\n[California State Water Project](/wiki/California_State_Water_Project \"California State Water Project\")\nby storing water, providing\n[flood control](/wiki/Flood_control \"Flood control\")\n, recreation, freshwater releases to assist in controlling the salinity intrusion into the Sacramento-San Joaquin Delta and protecting fish and wildlife.", + "score": 0.5920514170838611 + }, + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/wiki_Lake_Oroville.html", + "title": "\r\n Lake Oroville - Wikipedia\r\n ", + "content": "Lake Oroville - Wikipedia\n\n[Jump to content](#bodyContent)\n\n[Coordinates](/wiki/Geographic_coordinate_system \"Geographic coordinate system\")\n:\n[39\u00b032\u203214\u2033N\n\n121\u00b029\u203200\u2033W\n\n\ufeff / \ufeff\n\n\n39.53722\u00b0N 121.48333\u00b0W\n\n\ufeff /\n39.53722; -121.48333](https://geohack.toolforge.org/geohack.php?pagename=Lake_Oroville¶ms=39_32_14_N_121_29_00_W_scale:100000_type:waterbody_region:US-CA)\n\nFrom Wikipedia, the free encyclopedia\n\nReservoir in Butte County, California, U.S.", + "score": 0.589832461931772 + } +] diff --git a/pyrag3/tmp.md b/pyrag3/tmp.md new file mode 100644 index 0000000..75c1a0f --- /dev/null +++ b/pyrag3/tmp.md @@ -0,0 +1,8 @@ +[ + { + "url": "https://git.client.guacamolebox.net/butterfly/ragdocs/raw/branch/main/web_results/News_Blog_2026_Mar-2026_Lake-Oroville-Update-March-20-2026.html", + "title": "\r\n Lake Oroville Update March 20 2026\r\n ", + "score": 0.3749372215619156, + "content": "Lake Oroville Update March 20 2026\n\n\n\n\n\n\n\n\n\n\n\n[Skip to Main Content](#main-content)\n\n\u00d7\n\nSaving your location allows us to provide you with more relevant information.\n\n\nSet Location\n\n\n\n* Share:\n\n* [About](/About)\n* [Contact](/Contact)\n* [Current Conditions](/Current-Conditions)\n* [Settings](#siteSettings)\n\n\u00d7\n\nDefault\n\nHigh Contrast\n\nReset\n\nIncrease Font Size\n\nFont\n\nDecrease Font Size\n\nFont\n\n[![Department of Water Resources](assets/dwr-logo-new.png)](/)\n\n\n\nMenu\n\n\n\nContact\n\n\n\n\n\nSearch\n\nSearch this site:\n\nSearch\n\n\u00d7\n\n* [Home\n Home](/)\n* [Water Basics](/Water-Basics)\n\n + [Agriculture\n\n California is an agricultural leader, and it depends on irrigation.](/Water-Basics/Agriculture)\n + [The California Water System\n\n Our water system is a complex relationship between nature and manmade structures that move water.](/Water-Basics/The-California-Water-System)\n + [Climate Change Basics\n\n Climate change can have a profound impact on California\u2019s water resources.](/Water-Basics/Climate-Change-Basics)\n + [Conservation Tips\n\n Water conservation is part of the California lifestyle.](/Water-Basics/Conservation-Tips)\n + [The Delta\n\n The Sacramento-San Joaquin Delta is the heart of California\u2019s water system.](/Water-Basics/The-Delta)\n + [Drought\n\n Drought is a reoccurring feature of California\u2019s climate.](/Water-Basics/Drought)\n + [Environment\n\n California's unique geography and climate foster a diverse ecosystem that relies on water.](/Water-Basics/Environment)\n + [Flood\n\n California is prone to periodic, and sometimes severe, floods.](/Water-Basics/Flood)\n + [Groundwater\n\n The water stored beneath our feet is an important water supply source in California.](/Water-Basics/Groundwater)\n + [Glossary\n\n Find definitions of key terms and concepts related to our work.](/Water-Basics/Glossary)\n* [What We Do](/What-We-Do)\n\n + [Dam Safety\n\n We provide oversight to the design, construction, and maintenance of over 1,200 dams in California.](/Programs/All-Programs/Division-of-Safety-of-Dams)\n + [Education\n\n We provide resources for teachers and other Californians to learn about the complex world of water.](/What-We-Do/Education)\n + [Emergency Management\n\n We protect life and property from catastrophic events such as flood, drought, and infrastructure fai ...](/What-We-Do/Emergency-Response)\n + [Flood Preparedness\n\n We work with communities and emergency responders to prepare for flood season.](/What-We-Do/Flood-Preparedness)\n + [Infrastructure\n\n We construct, maintain and ensure the safety of a many water infrastructure facilities.](/What-We-Do/Infrastructure)\n + [Power\n\n The SWP is the fourth largest producer of energy in the state.](/What-We-Do/Power)\n + [Recreation\n\n The SWP provides extensive recreational activities at and around its reservoirs.](/What-We-Do/Recreation)\n + [Science\n\n Science is integral to our policy and management decisions.](/What-We-Do/Science)\n + [Sustainability\n\n Our work aims to protect natural ecosystems\u2019 abilities to meet the needs of future generations.](/What-We-Do/Sustainability)\n + [Water Storage & Supply\n\n We operate and maintain the State Water Project and support sustainable groundwater management.](/What-We-Do/Water-Storage-And-Supply)\n* [Programs](/Programs)\n\n + [All Programs\n\n Learn more about our other programs and projects that help sustainably manage California\u2019s water res ...](/Programs/All-Programs)\n + [Bay Delta\n\n Our work in the Sacramento-San Joaquin Delta aims to improve ecosystems and water quality, supply, r ...](/Programs/Bay-Delta)\n + [California Water Plan\n\n We provide a collaborative planning framework to make informed decisions for our water future.](/Programs/California-Water-Plan)\n + [Division of Safety Dams\n\n DSOD regulates more than 1200 dams to prevent failure, safeguard life, and protect property.](/Programs/All-Programs/Division-of-Safety-of-Dams)\n + [Engineering & Construction\n\n We provide engineering, geology, real estate, and geodetic services for an array of water infrastruc ...](/Programs/Engineering-And-Construction)\n + [Environmental Services\n\n We conduct scientific and environmental analysis to help protect and restore the environment.](/Programs/Integrated-Science-and-Engineering)\n + [Flood Management\n\n Our work in statewide flood forecasting and flood operations help reduce flood risk.](/Programs/Flood-Management)\n + [Groundwater Management\n\n We support the sustainable management of California\u2019s underground water reserves.](/Programs/Groundwater-Management)\n + [Integrated Regional Water Management\n\n We work with regional water managers to implement solutions that increase regional self-reliance.](/Programs/Integrated-Regional-Water-Management)\n + [State Water Project\n\n This water storage and delivery system provides water to almost 27 million Californians and 750,000 ...](/Programs/State-Water-Project)\n + [Water Use & Efficiency\n\n We assist agencies and individuals with agricultural and urban water conservation.](/Programs/Water-Use-And-Efficiency)\n + [Climate Change\n\n Managing climate change and its impact of water supply is one of DWR\u2019s core values.](/Programs/All-Programs/Climate-Change-Program)\n* [Work with Us](/Work-With-Us)\n\n + [Careers\n\n Join us for a rewarding career ensuring the sustainability of California's water resources.](/About/Careers)\n + [Grants & Loans\n\n We offer a number of grant and loan programs that support integrated water management activities.](/Work-With-Us/Grants-And-Loans)\n + [Real Estate\n\n We acquire temporary and permanent land rights to support our mission.](/Work-With-Us/Real-Estate)\n + [Procurement\n\n We contract for goods and services through our procurement process.](/Work-With-Us/Procurement)\n + [Technical Assistance\n\n We provide technical guidance, assistance, and resources through 4 regional offices.](/Work-With-Us/Technical-Assistance)\n + [Tribal Policy\n\n We are committed to open, inclusive, and regular communication with tribal governments, communities.](/About/Tribal-Policy)\n* [News](/News)\n\n + [Current Conditions\n\n We provide real-time hydrologic information, including reservoir and river conditions.](/Current-Conditions)\n + [DWR Updates\n\n Find feature stories, program updates, videos, and more.](/News/Blog)\n + [Events\n\n View upcoming and past DWR events](/News/Events)\n + [News Releases\n\n View our news releases for the latest information on our work and projects.](/News/News-Releases)\n + [Public Notices\n\n Find notices on public hearings, intent to award contracts and grants, and purchases.](/News/Public-Notices)\n* [Library](/Library)\n\n + [Documents\n\n View DWR reports and publications in our document library.](/SearchResults?search=&primaryFilters=&secondaryFilters=&tab=documents)\n + [Educational Materials\n\n We provide educational publications to view, download, and order.](/What-We-Do/Education/Education-Materials)\n + [Modeling & Analysis\n\n We develop and maintain a number of state-of-the-art models and analytical tools.](/Library/Modeling-and-Analysis)\n + [Other DWR Portals\n\n DWR maintains several additional web portals containing data and maps. These portals remains unchang ...](/Library/Other-DWR-Portals)\n + [Photos\n\n Pixel \u2013 our web-based photo gallery \u2013 features free downloadable images of California\u2019s natural reso ...](https://pixel-ca-dwr.photoshelter.com/index)\n + [Public Forms\n\n We provide an access point to DWR Public forms. The Directory contains links to electronic versions ...](/Library/Public-Forms)\n + [Videos\n\n View our YouTube channel for videos featuring our projects, facilities, and latest news.](https://www.youtube.com/user/calwater)\n* Search\n\n1. [Home](/)\n2. [News](/News)\n3. [DWR Updates](/News/Blog)\n4. Lake Oroville Update March 20 2026\n\nPrint Page\n\n# Lake Oroville Update March 20 2026\n\nPublished:\nMarch 20, 2026\n\n![California Conservation Corps crews carry sections of cut trees to a green chipper. ](assets/20250319_101657_Oroville.jpg)\n\n\nCalifornia Conservation Corps crews carry sections of cut trees to a green chipper.\n\n**DWR Conserving Water Storage at Lake Oroville**\n\nThe Department of Water Resources (DWR) is conserving as much water as possible at Lake Oroville as dry conditions continue in the Feather River watershed. Water operations at the facility continue to meet federal guidelines for downstream flood protection and state environmental regulations.\n\nBetween mid-September and June, DWR is required to operate Lake Oroville for flood control under federal Water Control Manual Guidelines set by the U.S. Army Corps of Engineers. These federal regulations establish a storage space that is reserved to capture inflows from rain and snowmelt, while protecting downstream communities from damaging flood events through coordinated releases. To maintain this storage space, DWR conducts flood protection releases from Lake Oroville.\u00a0Some water released from Oroville for flood control is captured downstream for beneficial uses by local landowners, communities, and the State Water Project. Releases from Oroville Dam also support Feather River habitat for salmon, steelhead, sturgeon, and other river species.\n\nDWR coordinates releases to the Feather River closely with the U.S. Army Corps of Engineers and other downstream water operators. DWR advises Feather River recreation users to remain alert as river flows are expected to be swift and cold and may change based on projected weather forecasts.\n\nThe information below reflects current reservoir level estimates. Forecasts can change quickly and may affect the estimates provided.\n\n* Current Oroville Reservoir Level: 873 feet elevation\n\n* Current Storage: 88 percent\u00a0of capacity\n\n* Total Releases to the Feather River: 5,500 cubic feet per second (cfs)\n\nThe Lake Oroville reservoir is the largest storage facility in the State Water Project, providing flood protection while supporting environmental and water delivery needs to 27 million Californians. DWR continues to monitor lake levels, weather forecasts, and mountain snow levels to optimize water storage and allow for carryover storage into the following year.\n\n**Explore the Lime Saddle Recreation Area**\n\nJust a short drive from the town of Paradise, the Lime Saddle Recreation Area is located along the western shoreline of the North Fork arm of Lake Oroville and accessible from Pentz Road. The boat ramp and day use area features picnic tables, sun shelters, ADA accessible flush toilets, and trash receptacles. A five-lane boat ramp provides access to Lake Oroville with two lanes extending down to 702 feet for lower water access. At the top of the boat ramp, a parking lot provides ample vehicle and trailer parking with ADA designated spots. In addition, the Lime Saddle Marina offers supplies, gas for boats, and boat rentals so you can enjoy a day on the lake.\n\nThe Lime Saddle Campground Area features 50 total campsites with 44 individual campsites (28 car/tent sites and 16 RV sites with full hookups). Each campsite features a picnic table and fire ring with grill. Bathroom facilities offer flush toilets and showers for visitors. The Lime Saddle Group Campground features two group campsites (six individual sites) with shade structures, multiple picnic tables, trash receptacles, a large barbecue, and a water fountain with spigot. Three of the sites at this facility are ADA accessible with the central restroom/shower building also offering ADA accommodations.\n\nSpring is a perfect time for a hiking or biking trip along the Lime Saddle Trail featuring scenic views of the reservoir. The 1.73-mile trail is accessible from Lime Saddle Recreation Road (marina access road) and routes visitors north before ending at the Lime Saddle Group Campground.\n\n**Road Closure for Tree Removal**\n\nDWR and its partners at the California Conservation Corps (CCC) and the Butte County Sheriff\u2019s Office are closing a portion of Oro Dam Blvd. East between Canyon Drive and Oro Powerhouse Road for hazard tree removal. A full road closure will be in effect\n**Monday through Thursday between 7 a.m. and 4 p.m. March 23-26.**\nDWR is taking advantage of the closure to remove large hazard trees and perform routine fuel load reduction activities to minimize fire risk in areas surrounding Lake Oroville.\n\nDWR\u2019s Fuel Load Management Plan is dedicated to reducing wildfire risk, enhancing public safety, and improving forest health around Lake Oroville. During vegetation management activities some trails within the Lake Oroville State Recreation Area may be intermittently closed. Trail users should use caution in active work areas and follow all posted signage.\n\n**Golden Mussel Inspection Program**\n\nDWR has moved its invasive mussel inspection/decontamination facilities at the North Thermalito Forebay to the paved RV parking lot near the entrance. Watercraft owners should turn right once past the main entrance. Signage is posted to help direct traffic. More details about DWR\u2019s mussel inspection program are available at\n[water.ca.gov/mussels](https://water.ca.gov/mussels)\n.\n\n***Watercraft Inspection Location/Decontamination Services***\n\n[North Thermalito Forebay](https://maps.app.goo.gl/KEZBUZnk8i6WfE3f8)\nat Garden Drive and HWY 70 in Oroville\n\nHours of operation: Daily from 8:30 a.m. to 5:30 p.m.\n\n***Sealed Vessel Launching***\n\nLake Oroville\n\nRamp hours: Daily from 7:30 a.m. to 9 p.m.\n\n* Spillway\n* Bidwell Canyon\n\nRamp hours: Daily from 5 a.m. to 9 p.m.\n\n* Lime Saddle\n* Loafer Creek/Loafer Point\n\nThermalito Afterbay\n\nRamp hours: Daily from 1.5 hours before sunrise to 1 hour after sunset\n\n* Monument Hill\n\nThermalito Forebay\n\nRamp Hours: Daily from 8 a.m. to sunset\n\n* North Forebay (Non-motorized vessels only)\n\n**Feather River Fish Monitoring Station**\n\nDWR resumed operations of the\n[Feather River fish monitoring station](https://water.ca.gov/News/Blog/2025/Aug-25/Feather-River-Fish-Monitoring-Station-Provides-Improved-Population-Data-on-Fish-Species)\non March 4, 2026 to capture the return of spring-run Chinook salmon. Monitoring was temporarily suspended at the end of December 2025 due to anticipated high flows in the Feather River. Upstream migrating fish totals between March 4-18, 2026 are:\n\n* Spring-run Chinook salmon: 20\n* Steelhead: 3\n* To see previous year data, visit\n [CalFish.org](https://www.calfish.org/ProgramsData/ConservationandManagement/CentralValleyMonitoring/SacramentoValleyTributaryMonitoring/DWR-FeatherRiverFishMonitoringStation.aspx)\n .\n\n**Current Lake Operations**\n\nLake Oroville is at 873 feet elevation and storage is approximately 3.02 million acre-feet (MAF), which is 88 percent of its total capacity and 126 percent of the historical average.\n\nFeather River flows are at 650 cfs through the City of Oroville with releases from the Thermalito Afterbay River Outlet at 4,850 cfs for a total Feather River release of 5,500 cfs downstream. DWR continues to assess Feather River releases daily.\n\nThe public can track precipitation, snow, reservoir levels, and more at the\n[California Data Exchange Center](https://cdec.water.ca.gov/)\n. The Lake Oroville gage station is identified as \u201cORO.\u201d\n\n*All data as of 11:59 p.m. on 3/19/2026.*\n\n### Tags\n\n[Oroville](# \"Oroville\")\n[Recreation](# \"Recreation\")\n[Construction](# \"Construction\")\n[Invasive Species](# \"Invasive Species\")\n\n### Dates\n\n#### [2017](#collapse-2017)\n\n* [June](https://water.ca.gov:443/News/Blog?year=2017&month=6)\n* [October](https://water.ca.gov:443/News/Blog?year=2017&month=10)\n* [November](https://water.ca.gov:443/News/Blog?year=2017&month=11)\n* [December](https://water.ca.gov:443/News/Blog?year=2017&month=12)\n\n#### [2018](#collapse-2018)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2018&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2018&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2018&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2018&month=4)\n* [June](https://water.ca.gov:443/News/Blog?year=2018&month=6)\n* [July](https://water.ca.gov:443/News/Blog?year=2018&month=7)\n* [August](https://water.ca.gov:443/News/Blog?year=2018&month=8)\n* [September](https://water.ca.gov:443/News/Blog?year=2018&month=9)\n* [October](https://water.ca.gov:443/News/Blog?year=2018&month=10)\n* [November](https://water.ca.gov:443/News/Blog?year=2018&month=11)\n* [December](https://water.ca.gov:443/News/Blog?year=2018&month=12)\n\n#### [2019](#collapse-2019)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2019&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2019&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2019&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2019&month=4)\n* [May](https://water.ca.gov:443/News/Blog?year=2019&month=5)\n* [June](https://water.ca.gov:443/News/Blog?year=2019&month=6)\n* [July](https://water.ca.gov:443/News/Blog?year=2019&month=7)\n* [August](https://water.ca.gov:443/News/Blog?year=2019&month=8)\n* [September](https://water.ca.gov:443/News/Blog?year=2019&month=9)\n* [October](https://water.ca.gov:443/News/Blog?year=2019&month=10)\n* [December](https://water.ca.gov:443/News/Blog?year=2019&month=12)\n\n#### [2020](#collapse-2020)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2020&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2020&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2020&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2020&month=4)\n* [May](https://water.ca.gov:443/News/Blog?year=2020&month=5)\n* [June](https://water.ca.gov:443/News/Blog?year=2020&month=6)\n* [August](https://water.ca.gov:443/News/Blog?year=2020&month=8)\n* [October](https://water.ca.gov:443/News/Blog?year=2020&month=10)\n* [November](https://water.ca.gov:443/News/Blog?year=2020&month=11)\n* [December](https://water.ca.gov:443/News/Blog?year=2020&month=12)\n\n#### [2021](#collapse-2021)\n\n* [November](https://water.ca.gov:443/News/Blog?year=2021&month=11)\n\n#### [2022](#collapse-2022)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2022&month=1)\n* [March](https://water.ca.gov:443/News/Blog?year=2022&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2022&month=4)\n* [May](https://water.ca.gov:443/News/Blog?year=2022&month=5)\n* [June](https://water.ca.gov:443/News/Blog?year=2022&month=6)\n* [July](https://water.ca.gov:443/News/Blog?year=2022&month=7)\n\n#### [2023](#collapse-2023)\n\n* [February](https://water.ca.gov:443/News/Blog?year=2023&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2023&month=3)\n* [July](https://water.ca.gov:443/News/Blog?year=2023&month=7)\n* [September](https://water.ca.gov:443/News/Blog?year=2023&month=9)\n* [October](https://water.ca.gov:443/News/Blog?year=2023&month=10)\n* [November](https://water.ca.gov:443/News/Blog?year=2023&month=11)\n* [December](https://water.ca.gov:443/News/Blog?year=2023&month=12)\n\n#### [2024](#collapse-2024)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2024&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2024&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2024&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2024&month=4)\n* [May](https://water.ca.gov:443/News/Blog?year=2024&month=5)\n* [June](https://water.ca.gov:443/News/Blog?year=2024&month=6)\n* [July](https://water.ca.gov:443/News/Blog?year=2024&month=7)\n* [August](https://water.ca.gov:443/News/Blog?year=2024&month=8)\n* [October](https://water.ca.gov:443/News/Blog?year=2024&month=10)\n* [November](https://water.ca.gov:443/News/Blog?year=2024&month=11)\n\n#### [2025](#collapse-2025)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2025&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2025&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2025&month=3)\n* [April](https://water.ca.gov:443/News/Blog?year=2025&month=4)\n* [May](https://water.ca.gov:443/News/Blog?year=2025&month=5)\n\n#### [2026](#collapse-2026)\n\n* [January](https://water.ca.gov:443/News/Blog?year=2026&month=1)\n* [February](https://water.ca.gov:443/News/Blog?year=2026&month=2)\n* [March](https://water.ca.gov:443/News/Blog?year=2026&month=3)\n\n#### About\n\n* [Directory](/Contact/Directory)\n* [Executive Bio](/About/Executive-Team)\n* Organizational chart\n* [Careers](/About/Careers)\n* [Email Subscriptions](https://public.govdelivery.com/landing_pages/29426/596c0629d73f8e9662447e5491dfa041)\n\n#### Campaigns\n\n* [Register to vote](http://registertovote.ca.gov)\n* [Save Our Water](http://saveourwater.com/)\n* [Flex Alert](http://www.flexalert.org)\n\n#### Publications\n\n* [News Releases](/News/News-Releases)\n* [Water Education Materials](/What-We-Do/Education/Education-Materials)\n* [DWR Portals](/Library/Other-DWR-Portals)\n\n#### Support\n\n* [Conditions of Use](/Conditions-of-Use)\n* [Tech Specs](/Tech-Specs)\n* [Help](/Help)\n\nOffice of Governor\n\nGavin Newsom\n\n[Visit Governor Website](https://www.gov.ca.gov/)\n\nNatural Resources Agency\n\nWade Crowfoot\n\n[Visit Natural Resources Website](https://resources.ca.gov/)\n\nDepartment of Water Resources\n\nKarla Nemeth\n\n[Visit Director Profile Page](/Executive-Bios-Director)\n\n\n\n* [Back to Top](#skip-to-content)\n* [Contact](/Contact)\n* [Privacy Policy](/Privacy-Policy)\n* [Conditions of Use](/Conditions-of-Use)\n* [Accessibility](/Accessibility)\n* [FPPC Disclosures](/FPPC-Disclosures)\n\n* [Facebook](https://www.facebook.com/CADWR)\n* [Twitter](https://twitter.com/CA_DWR)\n* [Instagram](https://www.instagram.com/cadepartmentofwaterresources)\n* [YouTube](https://www.youtube.com/user/calwater)\n\nCopyright \u00a9 2026 State of California" + } +] diff --git a/pyrag3/vector_service.py b/pyrag3/vector_service.py new file mode 100644 index 0000000..7d4cd5f --- /dev/null +++ b/pyrag3/vector_service.py @@ -0,0 +1,71 @@ +import requests +import logging +import math + +logger = logging.getLogger(__name__) + +class VectorService: + def __init__(self): + # Hardcoded for portability in standalone binary + self.api_key = "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" + self.model = "nvidia/llama-nemotron-embed-vl-1b-v2:free" + self.url = "https://openrouter.ai/api/v1/embeddings" + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + def get_embedding(self, text=None, image_b64=None, mime_type="image/jpeg"): + """Fetch multimodal embedding from OpenRouter API.""" + if not text and not image_b64: return None + + # Structure payload for multimodal embedding + content = [] + if text: + content.append({"type": "text", "text": text[:10000]}) # Truncate large text + if image_b64: + content.append({ + "type": "image_url", + "image_url": {"url": f"data:{mime_type};base64,{image_b64}"} + }) + + payload = { + "model": self.model, + "input": [{"content": content}] + } + + try: + resp = requests.post(self.url, headers=self.headers, json=payload, timeout=45) + if resp.status_code == 200: + data = resp.json() + if "data" in data and len(data["data"]) > 0: + return data["data"][0]["embedding"] + else: + logger.error(f"OpenRouter Error {resp.status_code}: {resp.text}") + except Exception as e: + logger.error(f"Failed to get embedding: {e}") + return None + + @staticmethod + def cosine_similarity(vec1, vec2): + """Pure Python implementation of Cosine Similarity.""" + if not vec1 or not vec2 or len(vec1) != len(vec2): + return 0.0 + + dot_product = sum(a * b for a, b in zip(vec1, vec2)) + magnitude1 = math.sqrt(sum(a * a for a in vec1)) + magnitude2 = math.sqrt(sum(b * b for b in vec2)) + + if magnitude1 == 0 or magnitude2 == 0: + return 0.0 + + return dot_product / (magnitude1 * magnitude2) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + vs = VectorService() + emb = vs.get_embedding("Hello world") + if emb: + print(f"Embedding length: {len(emb)}") + sim = vs.cosine_similarity(emb, emb) + print(f"Self-similarity: {sim}")