# docrag/rag/__init__.py

"""
RAG System - Retrieval Augmented Generation

This module provides the core RAG functionality for DocRAG, including:
- Website downloading and ingestion via website_downloader_tool
- Document processing and chunking
- Vector storage and similarity search
- Context retrieval for enhanced prompts
"""

from __future__ import annotations

import asyncio
import logging
import os
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlparse

from .document_processor import DocumentProcessor
from .vector_store import VectorStore
from .retriever import Retriever

# Import the website downloader tool
from website_downloader_tool import website_downloader

# Module-level logger named after this module; RAGSystem and get_rag_system
# report progress, warnings and errors through it.
log = logging.getLogger(__name__)


class RAGSystem:
    """
    Main RAG system that coordinates website downloading, document processing,
    storage, and retrieval.

    This class provides a unified interface for:
    - Downloading websites using website_downloader_tool
    - Processing downloaded content into the knowledge base
    - Querying for relevant context
    - Managing the document lifecycle
    """

    def __init__(
        self,
        embedding_model: str = "text-embedding-3-small",
        vector_store_path: str = "./data/vectors",
        documents_path: str = "./data/documents",
        downloaded_sites_path: str = "./data/downloaded_sites",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """
        Record the configuration; nothing is created on disk until initialize().

        Args:
            embedding_model: Embedding model name handed to the vector store
            vector_store_path: Directory the vector store persists into
            documents_path: Directory reserved for document storage
            downloaded_sites_path: Directory holding downloaded sites and the
                site registry file
            chunk_size: Chunk size handed to the document processor
            chunk_overlap: Chunk overlap handed to the document processor
        """
        # Lifecycle state: the components stay unset until initialize() builds them.
        self._initialized = False
        self._retriever: Optional[Retriever] = None
        self._vector_store: Optional[VectorStore] = None
        self._document_processor: Optional[DocumentProcessor] = None

        # Registry of downloaded sites, keyed by the ID derived from the source URL.
        self._site_registry: dict[str, dict[str, Any]] = {}

        # Filesystem locations are kept as Path objects.
        self.downloaded_sites_path = Path(downloaded_sites_path)
        self.documents_path = Path(documents_path)
        self.vector_store_path = Path(vector_store_path)

        # Embedding and chunking parameters.
        self.embedding_model = embedding_model
        self.chunk_overlap = chunk_overlap
        self.chunk_size = chunk_size

    async def initialize(self) -> None:
        """
        Build the RAG components and load the persisted site registry.

        Creates the storage directories, then constructs the document
        processor, the vector store (which is awaited to initialize) and the
        retriever on top of it. Calling this again after a successful run is
        a no-op.
        """
        if self._initialized:
            return

        log.info("Initializing RAG system...")

        # Make sure every storage directory exists before any component uses it.
        for directory in (
            self.vector_store_path,
            self.documents_path,
            self.downloaded_sites_path,
        ):
            directory.mkdir(parents=True, exist_ok=True)

        # Document processor: turns raw content into chunks.
        processor_options = {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
        }
        self._document_processor = DocumentProcessor(**processor_options)

        # Vector store: attached to the instance first, then initialized.
        store = VectorStore(
            persist_directory=str(self.vector_store_path),
            embedding_model=self.embedding_model,
        )
        self._vector_store = store
        await store.initialize()

        # Retriever: queries go through the vector store created above.
        self._retriever = Retriever(vector_store=store)

        # Restore the registry of previously downloaded sites.
        await self._load_site_registry()

        self._initialized = True
        log.info("RAG system initialized successfully")

    async def close(self) -> None:
        """
        Close the RAG system and release resources.

        Closes the vector store (if one was created) and persists the site
        registry. The registry is only written when the system had been
        initialized: before initialize() the in-memory registry is still the
        empty default, and saving it would overwrite the registry stored on
        disk with an empty one.

        The registry save and the reset of the initialized flag run even if
        closing the vector store raises; that exception still propagates.
        """
        was_initialized = self._initialized
        try:
            if self._vector_store:
                await self._vector_store.close()
        finally:
            if was_initialized:
                await self._save_site_registry()
            self._initialized = False
        log.info("RAG system closed")

    def _ensure_initialized(self) -> None:
        """
        Guard used by public operations that need the components to exist.

        Raises:
            RuntimeError: If initialize() has not completed (or close() ran since)
        """
        if self._initialized:
            return
        raise RuntimeError("RAG system not initialized. Call initialize() first.")

    async def _load_site_registry(self) -> None:
        """
        Load the site registry from ``site_registry.json`` in the sites directory.

        Loading is best-effort:
        - if the file does not exist, the in-memory registry is left untouched;
        - if the file cannot be read or parsed, or its top-level JSON value is
          not an object, a warning is logged and the registry is reset to empty.

        The top-level type check matters because the rest of the class indexes
        the registry by site ID; a JSON list or scalar would otherwise be
        accepted here and fail later in unrelated methods.
        """
        import json

        registry_file = self.downloaded_sites_path / "site_registry.json"
        try:
            # Open directly instead of checking exists() first: avoids the race
            # where the file disappears between the check and the open.
            with open(registry_file, "r") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return
        except Exception as e:
            # Deliberately broad: a damaged registry must not prevent startup.
            log.warning(f"Failed to load site registry: {e}")
            self._site_registry = {}
            return

        if not isinstance(loaded, dict):
            log.warning(
                "Failed to load site registry: expected a JSON object, "
                f"got {type(loaded).__name__}"
            )
            self._site_registry = {}
            return

        self._site_registry = loaded
        log.info(f"Loaded site registry with {len(self._site_registry)} sites")

    async def _save_site_registry(self) -> None:
        """
        Persist the site registry to ``site_registry.json`` in the sites directory.

        The registry is serialized in memory first, written to a sibling
        ``.tmp`` file, and then moved over the real file. Opening the real
        file for writing would truncate it immediately, so any serialization
        or I/O error would destroy the previously saved registry; with this
        sequence a failure leaves the old file intact.

        Saving stays best-effort: failures are logged, not raised. The file
        is written with the platform default text encoding, unchanged from
        before, so files written earlier remain readable.
        """
        import json

        registry_file = self.downloaded_sites_path / "site_registry.json"
        temp_file = registry_file.with_name(registry_file.name + ".tmp")
        try:
            # Serialize before touching the filesystem so an unserializable
            # entry fails while the existing registry file is still intact.
            payload = json.dumps(self._site_registry, indent=2, ensure_ascii=False)
            with open(temp_file, "w") as f:
                f.write(payload)
            temp_file.replace(registry_file)
            log.info(f"Saved site registry with {len(self._site_registry)} sites")
        except Exception as e:
            log.error(f"Failed to save site registry: {e}")
            try:
                temp_file.unlink()
            except OSError:
                # Cleanup only: the temp file may never have been created.
                pass

    async def download_and_ingest_website(
        self,
        url: str,
        max_pages: int = 50,
        threads: int = 6,
        download_external_assets: bool = False,
        external_domains: Optional[list[str]] = None,
    ) -> dict[str, Any]:
        """
        Download a website using website_downloader_tool and ingest all content
        into the knowledge base.

        This is the PRIMARY method for adding content to the RAG system.

        Args:
            url: URL of the website to download
            max_pages: Maximum number of pages to crawl
            threads: Number of concurrent download threads
            download_external_assets: Whether to download external assets
            external_domains: List of external domains to allow

        Returns:
            Dictionary with download and ingestion results. On failure it holds
            ``success`` (False), ``message`` and ``url``. On success it holds
            ``success``, ``url``, ``local_path``, ``pages_processed``,
            ``total_chunks``, ``stats`` and ``errors`` (per-file ingestion
            failures, empty when every page was processed).

        Raises:
            RuntimeError: If the system has not been initialized
        """
        self._ensure_initialized()

        log.info(f"Downloading website: {url}")

        destination = self.downloaded_sites_path / self._get_site_folder(url)

        # website_downloader is called synchronously and returns its result
        # dict directly, so calling it inline would block the event loop for
        # the whole crawl. Run it in the default executor instead.
        # NOTE(review): assumes the tool may be called from a worker thread — confirm.
        loop = asyncio.get_running_loop()
        download_result = await loop.run_in_executor(
            None,
            lambda: website_downloader(
                url=url,
                destination=str(destination),
                max_pages=max_pages,
                threads=threads,
                download_external_assets=download_external_assets,
                external_domains=external_domains,
            ),
        )

        if not download_result.get("success"):
            log.error(f"Website download failed: {download_result.get('message')}")
            return {
                "success": False,
                "message": download_result.get("message", "Download failed"),
                "url": url,
            }

        # Fall back to the requested destination when the tool does not report
        # one. An empty string would become Path("") — the current working
        # directory — and unrelated HTML files would be ingested.
        output_dir = download_result.get("output_directory") or str(destination)
        stats = download_result.get("stats", {})

        log.info(f"Website downloaded to: {output_dir}")

        # Process all HTML files from the downloaded site
        ingestion_result = await self._ingest_downloaded_site(
            site_path=Path(output_dir),
            source_url=url,
        )

        # Register the site so it can be listed and deleted later
        site_id = self._generate_site_id(url)
        self._site_registry[site_id] = {
            "url": url,
            "local_path": output_dir,
            "pages_downloaded": stats.get("pages_crawled", 0),
            "assets_downloaded": stats.get("assets_downloaded", 0),
            "chunks_ingested": ingestion_result.get("total_chunks", 0),
            "timestamp": self._get_timestamp(),
        }
        await self._save_site_registry()

        return {
            "success": True,
            "url": url,
            "local_path": output_dir,
            "pages_processed": ingestion_result.get("pages_processed", 0),
            "total_chunks": ingestion_result.get("total_chunks", 0),
            "stats": stats,
            # Surface per-file failures instead of dropping them silently.
            "errors": ingestion_result.get("errors", []),
        }

    async def _ingest_downloaded_site(
        self,
        site_path: Path,
        source_url: str,
    ) -> dict[str, Any]:
        """
        Walk a downloaded site and push every non-empty HTML page into the
        vector store.

        Each ``*.html`` file under ``site_path`` (searched recursively) is read,
        checked for extractable text, chunked by the document processor and
        stored together with metadata pointing back at the page. A failure on
        one file is recorded and logged; the remaining files are still processed.

        Args:
            site_path: Path to the downloaded website directory
            source_url: Original URL of the website

        Returns:
            Dictionary with ``pages_processed``, ``total_chunks`` and ``errors``
        """
        page_count = 0
        chunk_count = 0
        failures = []

        pages = list(site_path.rglob("*.html"))
        log.info(f"Found {len(pages)} HTML files in {site_path}")

        for page in pages:
            try:
                raw_html = page.read_bytes()

                # Where this page lives relative to the site root, and the URL
                # that location corresponds to.
                rel = page.relative_to(site_path)
                page_url = self._reconstruct_page_url(source_url, rel)

                # Pages without any extractable text are skipped entirely.
                extracted = await self._document_processor.extract_text_from_html(raw_html)
                if not extracted.strip():
                    continue

                processed = await self._document_processor.process(
                    content=raw_html,
                    filename=str(page),
                    metadata={
                        "source_url": source_url,
                        "page_url": page_url,
                        "local_path": str(page),
                        "relative_path": str(rel),
                        "source_type": "downloaded_website",
                    },
                )

                # Nothing to store when the processor produced no chunks.
                if not processed.get("chunks"):
                    continue

                # Stamp each chunk's metadata with its origin and a pointer
                # back to the downloaded page.
                for chunk_meta in processed.get("metadatas", []):
                    chunk_meta["source_url"] = source_url
                    chunk_meta["page_url"] = page_url
                    chunk_meta["local_path"] = str(page)
                    chunk_meta["pointer"] = {
                        "type": "downloaded_page",
                        "url": page_url,
                        "local_file": str(page),
                    }

                await self._vector_store.add_chunks(
                    chunks=processed["chunks"],
                    metadatas=processed.get("metadatas", []),
                    ids=processed.get("ids", []),
                )

                stored = len(processed["chunks"])
                chunk_count += stored
                page_count += 1

                log.debug(f"Ingested: {rel} -> {stored} chunks")

            except Exception as e:
                # Best-effort: one bad page must not abort the whole site.
                failures.append(f"{page}: {str(e)}")
                log.warning(f"Failed to process {page}: {e}")

        log.info(f"Ingestion complete: {page_count} pages, {chunk_count} chunks")

        return {
            "pages_processed": page_count,
            "total_chunks": chunk_count,
            "errors": failures,
        }

    def _get_site_folder(self, url: str) -> str:
        """
        Generate a folder name for a site from its URL.

        The folder is the URL's network location with ``.`` and ``:`` replaced
        by underscores (``https://example.com:8080/x`` -> ``example_com_8080``).

        Handled edge cases:
        - A URL without a scheme (``example.com/docs``) has no network location
          when parsed as-is. The name would then be empty and the download
          destination would be the sites root directory itself. Such input is
          re-parsed as a scheme-relative URL to recover the host.
        - Userinfo (``user:password@``) is stripped so credentials never end up
          in a directory name.
        - If no host can be determined at all, ``"unknown_site"`` is returned
          so the result is never empty.

        Args:
            url: URL of the site

        Returns:
            Non-empty folder name derived from the URL's host (and port)
        """
        netloc = urlparse(url).netloc
        if not netloc:
            try:
                netloc = urlparse(f"//{url}").netloc
            except ValueError:
                # The fallback parse rejects malformed hosts (e.g. an
                # unbalanced '['); treat that as "no host found".
                netloc = ""

        # Keep only the host[:port] part, dropping any "user:password@" prefix.
        host_port = netloc.rpartition("@")[2]
        folder = host_port.replace(".", "_").replace(":", "_")
        return folder or "unknown_site"

    def _generate_site_id(self, url: str) -> str:
        """
        Derive the registry key for a site from its URL.

        The key is the first 16 hex characters of the MD5 digest of the URL
        string. It is used as an identifier only, not for security.
        """
        import hashlib

        hasher = hashlib.md5()
        hasher.update(url.encode())
        full_hex = hasher.hexdigest()
        return full_hex[:16]

    def _reconstruct_page_url(self, base_url: str, relative_path: Path) -> str:
        """
        Map a file inside a downloaded site back to a page URL.

        Only the scheme and network location of ``base_url`` are used; the URL
        path is rebuilt from ``relative_path``:
        - a trailing ``index.html`` is dropped (the page is its directory),
        - any other ``*.html`` file loses its extension,
        - other file names are kept unchanged.
        """
        origin = urlparse(base_url)
        segments = list(relative_path.parts)

        if segments:
            leaf = segments[-1]
            if leaf == "index.html":
                # Directory index: the URL is the containing directory.
                segments.pop()
            elif leaf.endswith(".html"):
                # Regular page: strip the extension.
                segments[-1] = leaf[: -len(".html")]

        return f"{origin.scheme}://{origin.netloc}/" + "/".join(segments)

    def _get_timestamp(self) -> str:
        """
        Get the current UTC time in ISO 8601 format.

        The string carries no UTC offset suffix, which matches the format of
        timestamps already stored in the site registry.
        ``datetime.utcnow()`` is deprecated since Python 3.12, so the time is
        taken as an aware UTC datetime and the tzinfo is dropped before
        formatting.

        Returns:
            Naive-UTC timestamp such as ``2024-05-01T12:34:56.789012``
        """
        from datetime import datetime, timezone

        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    async def add_document(
        self,
        content: bytes,
        filename: str,
        metadata: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """
        Chunk a single document and store it in the knowledge base.

        Note: For websites, prefer using download_and_ingest_website() instead.

        Args:
            content: Raw document content
            filename: Original filename
            metadata: Optional metadata

        Returns:
            Dictionary with ``chunks`` (number of chunks produced) and
            ``document_id`` (as reported by the document processor)

        Raises:
            RuntimeError: If the system has not been initialized
        """
        self._ensure_initialized()

        processed = await self._document_processor.process(
            content=content,
            filename=filename,
            metadata=metadata,
        )

        if processed.get("chunks"):
            # Tag every chunk with a pointer back to the uploaded file.
            for chunk_meta in processed.get("metadatas", []):
                chunk_meta["pointer"] = {
                    "type": "uploaded_file",
                    "filename": filename,
                }

            store_args = {
                "chunks": processed["chunks"],
                "metadatas": processed.get("metadatas", []),
                "ids": processed.get("ids", []),
            }
            await self._vector_store.add_chunks(**store_args)

        log.info(f"Added document '{filename}' with {len(processed.get('chunks', []))} chunks")
        summary = {
            "chunks": len(processed.get("chunks", [])),
            "document_id": processed.get("document_id"),
        }
        return summary

    async def query(
        self,
        query: str,
        top_k: int = 5,
        filter_metadata: Optional[dict] = None,
        include_pointers: bool = True,
    ) -> dict[str, Any]:
        """
        Query the knowledge base for relevant context.

        Args:
            query: Query string
            top_k: Number of results to return
            filter_metadata: Optional metadata filters
            include_pointers: Whether to include page pointers in results

        Returns:
            Dictionary with:
            - ``context``: retrieved chunk texts, numbered ``[1]``, ``[2]``, ...
              and separated by blank lines
            - ``sources``: unique source identifiers in the order they first
              appear in the results
            - ``pointers``: one entry per result that has a pointer, extended
              with ``chunk_id`` and ``score`` (empty if ``include_pointers``
              is False)
            - ``num_results``: number of retrieved results
            - ``results``: the retriever's results, unmodified

        Raises:
            RuntimeError: If the system has not been initialized
        """
        self._ensure_initialized()

        results = await self._retriever.retrieve(
            query=query,
            top_k=top_k,
            filter_metadata=filter_metadata,
        )

        context_parts = []
        sources = []
        pointers = []

        for rank, result in enumerate(results, start=1):
            context_parts.append(f"[{rank}] {result['content']}")

            # Tolerate results whose metadata is missing or explicitly None.
            metadata = result.get("metadata") or {}

            # Most specific source wins: page URL, then site URL, then a
            # generic "source" value.
            source = (
                metadata.get("page_url")
                or metadata.get("source_url")
                or metadata.get("source")
            )
            if source:
                sources.append(source)

            raw_pointer = metadata.get("pointer")
            if include_pointers and raw_pointer:
                # Build a new dict rather than writing chunk_id/score into the
                # stored pointer: the same metadata objects are handed back in
                # "results" and may be shared with the retriever.
                pointers.append(
                    {
                        **raw_pointer,
                        "chunk_id": result.get("id"),
                        "score": result.get("score"),
                    }
                )

        return {
            "context": "\n\n".join(context_parts),
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # unlike set(), whose iteration order is arbitrary.
            "sources": list(dict.fromkeys(sources)),
            "pointers": pointers,
            "num_results": len(results),
            "results": results,
        }

    async def list_documents(self) -> list[dict[str, Any]]:
        """
        Return the vector store's document list, annotated with site details.

        Every document that carries a ``source_url`` matching a registered
        site gets a ``site_info`` entry holding that site's registry record.

        Raises:
            RuntimeError: If the system has not been initialized
        """
        self._ensure_initialized()

        documents = await self._vector_store.list_documents()

        for document in documents:
            origin = document.get("source_url")
            if not origin:
                continue

            registry_key = self._generate_site_id(origin)
            if registry_key not in self._site_registry:
                continue

            document["site_info"] = self._site_registry[registry_key]

        return documents

    async def list_downloaded_sites(self) -> list[dict[str, Any]]:
        """
        Return the registry records of all downloaded websites.

        The result is a new list; the records inside it are the registry's
        own dictionaries. This method does not require initialize().
        """
        return [*self._site_registry.values()]

    async def delete_document(self, document_id: str) -> None:
        """
        Remove one document from the vector store.

        Args:
            document_id: Identifier of the document to remove

        Raises:
            RuntimeError: If the system has not been initialized
        """
        self._ensure_initialized()

        store = self._vector_store
        await store.delete_document(document_id)

        log.info(f"Deleted document {document_id}")

    async def delete_site(self, url: str) -> dict[str, Any]:
        """
        Delete a downloaded website and all its content from the knowledge base.

        The site's chunks are removed from the vector store, its local files
        are removed from disk, and its registry entry is dropped and persisted.

        Local files are kept in two situations:
        - another registered site points at the same local directory (site
          folders are derived from the domain, while registry entries are per
          URL, so two URLs on one domain share a folder);
        - the recorded path is the downloaded-sites root or one of its parent
          directories, where removal would wipe every site and the registry.

        Args:
            url: URL of the site to delete

        Returns:
            Dictionary with deletion results. ``{"success": False, "message"}``
            if the URL is not registered; otherwise ``success``, ``url``,
            ``deleted_chunks``, ``deleted_path`` (the recorded local path) and
            ``files_deleted`` (whether the directory was actually removed).

        Raises:
            RuntimeError: If the system has not been initialized
        """
        import shutil

        self._ensure_initialized()

        site_id = self._generate_site_id(url)

        if site_id not in self._site_registry:
            return {"success": False, "message": f"Site not found: {url}"}

        site_info = self._site_registry[site_id]
        local_path = site_info.get("local_path")

        # Delete from vector store
        deleted_chunks = await self._vector_store.delete_by_source_url(url)

        # Delete local files, unless that would destroy data still in use
        files_deleted = False
        if local_path:
            target = Path(local_path)
            shared = any(
                other_id != site_id
                and other.get("local_path")
                and Path(other["local_path"]) == target
                for other_id, other in self._site_registry.items()
            )
            sites_root = self.downloaded_sites_path.resolve()
            resolved = target.resolve()
            covers_root = resolved == sites_root or resolved in sites_root.parents

            if shared:
                log.info(f"Keeping {local_path}: still used by another registered site")
            elif covers_root:
                log.warning(f"Refusing to delete {local_path}: it contains the downloaded sites root")
            elif target.exists():
                shutil.rmtree(target)
                files_deleted = True

        # Remove from registry
        del self._site_registry[site_id]
        await self._save_site_registry()

        log.info(f"Deleted site: {url}")

        return {
            "success": True,
            "url": url,
            "deleted_chunks": deleted_chunks,
            "deleted_path": local_path,
            "files_deleted": files_deleted,
        }

    def get_site_info(self, url: str) -> Optional[dict[str, Any]]:
        """
        Look up the registry record for a downloaded site.

        Args:
            url: URL the site was registered under

        Returns:
            The site's registry record, or None if the URL is not registered
        """
        registry_key = self._generate_site_id(url)
        try:
            return self._site_registry[registry_key]
        except KeyError:
            return None


# Global RAG system instance: None until get_rag_system() creates it, after
# which get_rag_system() keeps returning this same object.
_rag_system: Optional[RAGSystem] = None


async def get_rag_system(
    embedding_model: str = "text-embedding-3-small",
    vector_store_path: str = "./data/vectors",
    documents_path: str = "./data/documents",
    downloaded_sites_path: str = "./data/downloaded_sites",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> RAGSystem:
    """
    Get or create the global RAG system instance.

    The arguments are only used by the call that creates the instance; later
    calls return the existing instance and ignore them.

    The instance is stored in the module global only after ``initialize()``
    has completed. If initialization raises, nothing is stored and the next
    call retries; a caller arriving while another call is still initializing
    never receives a half-initialized object.

    Args:
        embedding_model: Name of the embedding model
        vector_store_path: Path to vector store
        documents_path: Path to document storage
        downloaded_sites_path: Path to downloaded websites
        chunk_size: Size of document chunks
        chunk_overlap: Overlap between chunks

    Returns:
        Initialized RAGSystem instance
    """
    global _rag_system

    if _rag_system is None:
        candidate = RAGSystem(
            embedding_model=embedding_model,
            vector_store_path=vector_store_path,
            documents_path=documents_path,
            downloaded_sites_path=downloaded_sites_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        await candidate.initialize()

        # Another task may have published an instance while this one was
        # awaiting initialize(); the first published instance wins so every
        # caller shares one system. The surplus candidate is dropped without
        # close(), because close() writes the site registry and could
        # overwrite newer data saved by the shared instance.
        if _rag_system is None:
            _rag_system = candidate

    return _rag_system