Integrate website_downloader_tool into RAG system

Features: - RAG system now uses website_downloader_tool as primary content ingestion method - download_and_ingest_website() method for complete website processing - Stores page pointers (source_url, page_url, local_path) in vector store - Site registry tracks all downloaded websites with metadata - New API endpoints for website management: - POST /v1/documents/website - Download and ingest a website - GET /v1/documents/sites - List all downloaded sites - GET /v1/documents/sites/{url} - Get site info - DELETE /v1/documents/sites/{url} - Delete a site and its content Changes: - rag/__init__.py: Added download_and_ingest_website(), site registry - rag/document_processor.py: Added extract_text_from_html() public method - rag/vector_store.py: Added delete_by_source_url(), get_stats() - main.py: New website endpoints, integrated tool with RAG system
2026-03-29 02:36:59 +00:00 · 2026-03-29 02:36:59 +00:00 · 6aecc4b231
commit 6aecc4b231
parent eabdadfb62
4 changed files with 566 additions and 51 deletions
--- a/main.py
+++ b/main.py
@ -581,27 +581,36 @@ async def check_and_execute_tools(
    # In a production system, you'd use the LLM to decide tool usage
    message_lower = user_message.lower()
-    # Check for website download intent
+    # Check for website download intent - use RAG system for full integration
-    if any(kw in message_lower for kw in ["download website", "mirror site", "crawl", "archive site"]):
+    if any(kw in message_lower for kw in ["download website", "mirror site", "crawl", "archive site", "ingest site"]):
        # Extract URL from message
        import re
        url_pattern = r'https?://[^\s]+'
        urls = re.findall(url_pattern, user_message)
-        if urls:
+        if urls and state.rag_system:
-            tool_result = state.tool_manager.execute_tool(
+            # Use RAG system's integrated website downloader
-                "website_downloader",
+            try:
-                {"url": urls[0], "max_pages": 10}
+                result = await state.rag_system.download_and_ingest_website(
-            )
+                    url=urls[0],
-            tool_calls.append({
+                    max_pages=20,  # Reasonable default
-                "id": f"call_{uuid.uuid4().hex[:24]}",
+                )
-                "type": "function",
+                
-                "function": {
+                tool_calls.append({
-                    "name": "website_downloader",
+                    "id": f"call_{uuid.uuid4().hex[:24]}",
-                    "arguments": json.dumps({"url": urls[0]}),
+                    "type": "function",
-                }
+                    "function": {
-            })
+                        "name": "website_downloader",
-            log.info(f"Executed website_downloader tool: {tool_result}")
+                        "arguments": json.dumps({
                            "url": urls[0],
                            "success": result.get("success"),
                            "chunks_ingested": result.get("total_chunks", 0),
                        }),
                    }
                })
                log.info(f"Downloaded and ingested website: {urls[0]} -> {result.get('total_chunks', 0)} chunks")
            except Exception as e:
                log.error(f"Website download failed: {e}")
    return tool_calls
@ -638,9 +647,67 @@ async def upload_document(request: Request):
        raise HTTPException(status_code=500, detail=str(e))
 class WebsiteDownloadRequest(BaseModel):
    """Request model for website download."""
    # Body of POST /v1/documents/website; every field is forwarded unchanged
    # to RAGSystem.download_and_ingest_website() by the endpoint handler.

    # Address of the website to download.
    url: str
    # Upper bound on the number of pages to crawl.
    max_pages: int = 50
    # Number of concurrent download threads handed to the downloader.
    threads: int = 6
    # Whether assets hosted outside the site should be downloaded as well.
    download_external_assets: bool = False
    # Optional list of external domains to allow; None means none were given.
    external_domains: Optional[list[str]] = None
@app.post("/v1/documents/website")
async def download_website(request: WebsiteDownloadRequest):
    """
    Download a website and ingest it into the knowledge base.
    This is the PRIMARY way to add content to the RAG system.
    Uses the website_downloader_tool to download and process websites.
    """
    rag = state.rag_system
    if not rag:
        raise HTTPException(status_code=503, detail="RAG system not initialized")

    try:
        log.info(f"Downloading website: {request.url}")
        outcome = await rag.download_and_ingest_website(
            url=request.url,
            max_pages=request.max_pages,
            threads=request.threads,
            download_external_assets=request.download_external_assets,
            external_domains=request.external_domains,
        )

        # A reported failure becomes a 500 carrying the downloader's message.
        if not outcome.get("success"):
            raise HTTPException(
                status_code=500,
                detail=outcome.get("message", "Website download failed"),
            )

        return {
            "success": True,
            "message": f"Website downloaded and ingested: {request.url}",
            "url": request.url,
            "local_path": outcome.get("local_path"),
            "pages_processed": outcome.get("pages_processed", 0),
            "total_chunks": outcome.get("total_chunks", 0),
        }
    except HTTPException:
        # Already a well-formed HTTP error; do not wrap it in another 500.
        raise
    except Exception as err:
        log.exception("Website download failed")
        raise HTTPException(status_code=500, detail=str(err))
@app.post("/v1/documents/url")
 async def add_document_from_url(request: dict):
-    """Add a document from URL to the knowledge base."""
+    """
    Add a document from URL to the knowledge base.
    NOTE: For websites, prefer using /v1/documents/website instead
    as it downloads the entire site and provides better context.
    """
    if not state.rag_system:
        raise HTTPException(status_code=503, detail="RAG system not initialized")
@ -670,6 +737,47 @@ async def list_documents():
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/v1/documents/sites")
async def list_downloaded_sites():
    """List all downloaded websites in the knowledge base."""
    rag = state.rag_system
    if not rag:
        raise HTTPException(status_code=503, detail="RAG system not initialized")

    try:
        # The registry entries are returned as-is under the "sites" key.
        return {"sites": await rag.list_downloaded_sites()}
    except Exception as err:
        log.exception("Site listing failed")
        raise HTTPException(status_code=500, detail=str(err))
@app.get("/v1/documents/sites/{url:path}")
async def get_site_info(url: str):
    """Get information about a specific downloaded site."""
    rag = state.rag_system
    if not rag:
        raise HTTPException(status_code=503, detail="RAG system not initialized")

    try:
        from urllib.parse import unquote

        # The site URL arrives as the trailing path segment(s); percent-decode
        # it and default to https when the caller left the scheme off.
        site_url = unquote(url)
        has_scheme = site_url.startswith(("http://", "https://"))
        if not has_scheme:
            site_url = "https://" + site_url

        record = rag.get_site_info(site_url)
        if not record:
            raise HTTPException(status_code=404, detail="Site not found")
        return {"site": record}
    except HTTPException:
        # Keep the 404 above intact instead of converting it into a 500.
        raise
    except Exception as err:
        log.exception("Site info retrieval failed")
        raise HTTPException(status_code=500, detail=str(err))
@app.delete("/v1/documents/{doc_id}")
 async def delete_document(doc_id: str):
    """Delete a document from the knowledge base."""
@ -684,6 +792,40 @@ async def delete_document(doc_id: str):
        raise HTTPException(status_code=500, detail=str(e))
@app.delete("/v1/documents/sites/{url:path}")
async def delete_site(url: str):
    """Delete a downloaded website and all its content from the knowledge base."""
    rag = state.rag_system
    if not rag:
        raise HTTPException(status_code=503, detail="RAG system not initialized")

    try:
        from urllib.parse import unquote

        # The site URL arrives as the trailing path segment(s); percent-decode
        # it and default to https when the caller left the scheme off.
        site_url = unquote(url)
        has_scheme = site_url.startswith(("http://", "https://"))
        if not has_scheme:
            site_url = "https://" + site_url

        outcome = await rag.delete_site(site_url)
        if not outcome.get("success"):
            raise HTTPException(
                status_code=404,
                detail=outcome.get("message", "Site not found"),
            )

        return {
            "success": True,
            "message": f"Site {site_url} deleted",
            "deleted_chunks": outcome.get("deleted_chunks", 0),
            "deleted_path": outcome.get("deleted_path"),
        }
    except HTTPException:
        # Keep the 404 above intact instead of converting it into a 500.
        raise
    except Exception as err:
        log.exception("Site deletion failed")
        raise HTTPException(status_code=500, detail=str(err))
 # =============================================================================
 # Health and Status Endpoints
 # =============================================================================
@ -711,6 +853,8 @@ async def root():
            "chat": "/v1/chat/completions",
            "models": "/v1/models",
            "documents": "/v1/documents",
            "download_website": "/v1/documents/website",
            "list_sites": "/v1/documents/sites",
            "health": "/health",
        },
    }
--- a/rag/init.py
+++ b/rag/init.py
@ -2,6 +2,7 @@
 RAG System - Retrieval Augmented Generation
 This module provides the core RAG functionality for DocRAG, including:
 - Website downloading and ingestion via website_downloader_tool
 - Document processing and chunking
 - Vector storage and similarity search
 - Context retrieval for enhanced prompts
@ -14,20 +15,26 @@ import logging
 import os
 from pathlib import Path
 from typing import Any, Optional
 from urllib.parse import urlparse
 from .document_processor import DocumentProcessor
 from .vector_store import VectorStore
 from .retriever import Retriever
 # Import the website downloader tool
 from website_downloader_tool import website_downloader
 log = logging.getLogger(__name__)
 class RAGSystem:
    """
-    Main RAG system that coordinates document processing, storage, and retrieval.
+    Main RAG system that coordinates website downloading, document processing, 
    storage, and retrieval.
    This class provides a unified interface for:
-    - Adding documents to the knowledge base
+    - Downloading websites using website_downloader_tool
    - Processing downloaded content into the knowledge base
    - Querying for relevant context
    - Managing the document lifecycle
    """
@ -37,12 +44,14 @@ class RAGSystem:
        embedding_model: str = "text-embedding-3-small",
        vector_store_path: str = "./data/vectors",
        documents_path: str = "./data/documents",
        downloaded_sites_path: str = "./data/downloaded_sites",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        self.embedding_model = embedding_model
        self.vector_store_path = Path(vector_store_path)
        self.documents_path = Path(documents_path)
        self.downloaded_sites_path = Path(downloaded_sites_path)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
@ -50,6 +59,9 @@ class RAGSystem:
        self._document_processor: Optional[DocumentProcessor] = None
        self._vector_store: Optional[VectorStore] = None
        self._retriever: Optional[Retriever] = None
        # Track downloaded sites with their source URLs
        self._site_registry: dict[str, dict[str, Any]] = {}
    async def initialize(self) -> None:
        """Initialize the RAG system components."""
@ -61,6 +73,7 @@ class RAGSystem:
        # Create directories
        self.vector_store_path.mkdir(parents=True, exist_ok=True)
        self.documents_path.mkdir(parents=True, exist_ok=True)
        self.downloaded_sites_path.mkdir(parents=True, exist_ok=True)
        # Initialize document processor
        self._document_processor = DocumentProcessor(
@ -73,12 +86,16 @@ class RAGSystem:
            persist_directory=str(self.vector_store_path),
            embedding_model=self.embedding_model,
        )
        await self._vector_store.initialize()
        # Initialize retriever
        self._retriever = Retriever(
            vector_store=self._vector_store,
        )
        # Load existing site registry
        await self._load_site_registry()
        self._initialized = True
        log.info("RAG system initialized successfully")
@ -86,6 +103,7 @@ class RAGSystem:
        """Close the RAG system and release resources."""
        if self._vector_store:
            await self._vector_store.close()
        await self._save_site_registry()
        self._initialized = False
        log.info("RAG system closed")
@ -94,6 +112,228 @@ class RAGSystem:
        if not self._initialized:
            raise RuntimeError("RAG system not initialized. Call initialize() first.")
    async def _load_site_registry(self) -> None:
        """
        Load the site registry from disk.

        Reads ``site_registry.json`` inside ``downloaded_sites_path``. When the
        file does not exist the in-memory registry is left untouched. When the
        file cannot be read, is not valid JSON, or does not contain a JSON
        object, a warning is logged and the registry is reset to empty.
        """
        import json

        registry_file = self.downloaded_sites_path / "site_registry.json"
        if not registry_file.exists():
            return

        try:
            # Explicit UTF-8: the file is written with ensure_ascii=False, so
            # relying on the platform default encoding breaks non-ASCII URLs.
            with open(registry_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                # The rest of the class indexes the registry by site id.
                raise ValueError("site registry must be a JSON object")
            self._site_registry = loaded
            log.info(f"Loaded site registry with {len(self._site_registry)} sites")
        except Exception as e:
            log.warning(f"Failed to load site registry: {e}")
            self._site_registry = {}

    async def _save_site_registry(self) -> None:
        """
        Save the site registry to disk.

        Writes the registry as UTF-8 JSON to ``site_registry.json`` inside
        ``downloaded_sites_path``. Failures are logged, not raised.
        """
        import json

        registry_file = self.downloaded_sites_path / "site_registry.json"
        # Write to a sibling temp file first and swap it in, so an interrupted
        # write cannot leave a truncated registry behind.
        tmp_file = registry_file.with_name(registry_file.name + ".tmp")
        try:
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(self._site_registry, f, indent=2, ensure_ascii=False)
            tmp_file.replace(registry_file)
            log.info(f"Saved site registry with {len(self._site_registry)} sites")
        except Exception as e:
            log.error(f"Failed to save site registry: {e}")
    async def download_and_ingest_website(
        self,
        url: str,
        max_pages: int = 50,
        threads: int = 6,
        download_external_assets: bool = False,
        external_domains: Optional[list[str]] = None,
    ) -> dict[str, Any]:
        """
        Download a website using website_downloader_tool and ingest all content
        into the knowledge base.

        This is the PRIMARY method for adding content to the RAG system.

        Args:
            url: URL of the website to download
            max_pages: Maximum number of pages to crawl
            threads: Number of concurrent download threads
            download_external_assets: Whether to download external assets
            external_domains: List of external domains to allow

        Returns:
            Dictionary with download and ingestion results. On failure it
            holds ``success=False``, ``message`` and ``url``; on success it
            holds ``success=True``, ``url``, ``local_path``,
            ``pages_processed``, ``total_chunks`` and ``stats``.

        Raises:
            RuntimeError: If the RAG system has not been initialized.
        """
        import asyncio

        self._ensure_initialized()

        log.info(f"Downloading website: {url}")

        destination = str(self.downloaded_sites_path / self._get_site_folder(url))

        # website_downloader is a synchronous call; run it in a worker thread
        # so a multi-page crawl does not stall the event loop.
        download_result = await asyncio.to_thread(
            website_downloader,
            url=url,
            destination=destination,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=external_domains,
        )

        if not download_result.get("success"):
            log.error(f"Website download failed: {download_result.get('message')}")
            return {
                "success": False,
                "message": download_result.get("message", "Download failed"),
                "url": url,
            }

        # Fall back to the requested destination: an empty output directory
        # would turn into Path("") (the current working directory) and every
        # HTML file below it would be ingested.
        output_dir = download_result.get("output_directory") or destination
        stats = download_result.get("stats") or {}

        log.info(f"Website downloaded to: {output_dir}")

        # Process all HTML files from the downloaded site
        ingestion_result = await self._ingest_downloaded_site(
            site_path=Path(output_dir),
            source_url=url,
        )

        # Register the site
        site_id = self._generate_site_id(url)
        self._site_registry[site_id] = {
            "url": url,
            "local_path": output_dir,
            "pages_downloaded": stats.get("pages_crawled", 0),
            "assets_downloaded": stats.get("assets_downloaded", 0),
            "chunks_ingested": ingestion_result.get("total_chunks", 0),
            "timestamp": self._get_timestamp(),
        }
        await self._save_site_registry()

        return {
            "success": True,
            "url": url,
            "local_path": output_dir,
            "pages_processed": ingestion_result.get("pages_processed", 0),
            "total_chunks": ingestion_result.get("total_chunks", 0),
            "stats": stats,
        }
    async def _ingest_downloaded_site(
        self,
        site_path: Path,
        source_url: str,
    ) -> dict[str, Any]:
        """
        Index every ``*.html`` file found under a downloaded site directory.

        Each page is run through the document processor and its chunks are
        stored in the vector store. Every chunk's metadata is tagged with the
        site URL, the reconstructed page URL, the local file path and a
        ``pointer`` dict. A failure on one file is recorded and does not stop
        the remaining files.

        Args:
            site_path: Path to the downloaded website directory
            source_url: Original URL of the website

        Returns:
            Dictionary with ``pages_processed``, ``total_chunks`` and
            ``errors`` (one ``"<file>: <error>"`` string per failed file).
        """
        failures = []
        page_count = 0
        chunk_count = 0

        html_files = list(site_path.rglob("*.html"))
        log.info(f"Found {len(html_files)} HTML files in {site_path}")

        for html_file in html_files:
            try:
                raw_html = html_file.read_bytes()

                # Path inside the site directory, used to rebuild the page URL.
                relative_path = html_file.relative_to(site_path)
                page_url = self._reconstruct_page_url(source_url, relative_path)

                # Pages that yield no text are skipped before chunking.
                extracted = await self._document_processor.extract_text_from_html(raw_html)
                if not extracted.strip():
                    continue

                local_file = str(html_file)
                doc_info = await self._document_processor.process(
                    content=raw_html,
                    filename=local_file,
                    metadata={
                        "source_url": source_url,
                        "page_url": page_url,
                        "local_path": local_file,
                        "relative_path": str(relative_path),
                        "source_type": "downloaded_website",
                    },
                )

                if not doc_info.get("chunks"):
                    continue

                chunks = doc_info["chunks"]
                chunk_metas = doc_info.get("metadatas", [])

                # Stamp each chunk with where it came from; the pointer dict is
                # built fresh per chunk so chunks never share one object.
                for chunk_meta in chunk_metas:
                    chunk_meta.update(
                        source_url=source_url,
                        page_url=page_url,
                        local_path=local_file,
                        pointer={
                            "type": "downloaded_page",
                            "url": page_url,
                            "local_file": local_file,
                        },
                    )

                await self._vector_store.add_chunks(
                    chunks=chunks,
                    metadatas=chunk_metas,
                    ids=doc_info.get("ids", []),
                )

                chunk_count += len(chunks)
                page_count += 1
                log.debug(f"Ingested: {relative_path} -> {len(chunks)} chunks")

            except Exception as exc:
                failures.append(f"{html_file}: {str(exc)}")
                log.warning(f"Failed to process {html_file}: {exc}")

        log.info(f"Ingestion complete: {page_count} pages, {chunk_count} chunks")

        return {
            "pages_processed": page_count,
            "total_chunks": chunk_count,
            "errors": failures,
        }
    def _get_site_folder(self, url: str) -> str:
        """Generate a folder name for a site from its URL."""
        parsed = urlparse(url)
        # Use domain name as folder, replace dots with underscores
        folder = parsed.netloc.replace(".", "_").replace(":", "_")
        return folder
    def _generate_site_id(self, url: str) -> str:
        """Generate a unique ID for a site."""
        import hashlib
        return hashlib.md5(url.encode()).hexdigest()[:16]
    def _reconstruct_page_url(self, base_url: str, relative_path: Path) -> str:
        """Reconstruct the original URL for a downloaded page."""
        parsed = urlparse(base_url)
        # Convert relative path back to URL path
        path_parts = list(relative_path.parts)
        # Handle index.html as directory
        if path_parts and path_parts[-1] == "index.html":
            path_parts = path_parts[:-1]
        # Remove .html extension from other files
        elif path_parts and path_parts[-1].endswith(".html"):
            path_parts[-1] = path_parts[-1][:-5]
        url_path = "/".join(path_parts)
        return f"{parsed.scheme}://{parsed.netloc}/{url_path}"
    def _get_timestamp(self) -> str:
        """Get current timestamp in ISO format."""
        from datetime import datetime
        return datetime.utcnow().isoformat()
    async def add_document(
        self,
        content: bytes,
@ -102,6 +342,8 @@ class RAGSystem:
    ) -> dict[str, Any]:
        """
        Add a document to the knowledge base.
        Note: For websites, prefer using download_and_ingest_website() instead.
        Args:
            content: Raw document content
@ -120,8 +362,15 @@ class RAGSystem:
            metadata=metadata,
        )
-        # Store chunks in vector store
+        # Store chunks in vector store with pointers
        if doc_info.get("chunks"):
            # Add pointer metadata
            for metadata in doc_info.get("metadatas", []):
                metadata["pointer"] = {
                    "type": "uploaded_file",
                    "filename": filename,
                }
            await self._vector_store.add_chunks(
                chunks=doc_info["chunks"],
                metadatas=doc_info.get("metadatas", []),
@ -131,37 +380,12 @@ class RAGSystem:
        log.info(f"Added document '{filename}' with {len(doc_info.get('chunks', []))} chunks")
        return {"chunks": len(doc_info.get("chunks", [])), "document_id": doc_info.get("document_id")}
    async def add_document_from_url(self, url: str) -> dict[str, Any]:
        """
        Add a document from a URL to the knowledge base.

        Fetches the URL with a 30-second total timeout and passes the body to
        ``add_document``. The filename is the last path segment of the URL, or
        ``webpage.html`` when the path has none.

        Args:
            url: URL to fetch and process

        Returns:
            Dictionary with processing results

        Raises:
            RuntimeError: If the RAG system has not been initialized.
            aiohttp.ClientResponseError: If the server answers with an error status.
        """
        self._ensure_initialized()

        # Fetch content from URL
        import aiohttp

        # A bare number as ``timeout`` is deprecated in aiohttp; the explicit
        # ClientTimeout keeps the same 30-second overall limit.
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                response.raise_for_status()
                content = await response.read()

        # Extract filename from URL
        from urllib.parse import urlparse

        parsed = urlparse(url)
        filename = os.path.basename(parsed.path) or "webpage.html"

        return await self.add_document(content=content, filename=filename, metadata={"source_url": url})
    async def query(
        self,
        query: str,
        top_k: int = 5,
        filter_metadata: Optional[dict] = None,
        include_pointers: bool = True,
    ) -> dict[str, Any]:
        """
        Query the knowledge base for relevant context.
@ -170,9 +394,10 @@ class RAGSystem:
            query: Query string
            top_k: Number of results to return
            filter_metadata: Optional metadata filters
            include_pointers: Whether to include page pointers in results
        Returns:
-            Dictionary with context and sources
+            Dictionary with context, sources, and page pointers
        """
        self._ensure_initialized()
@ -183,20 +408,37 @@ class RAGSystem:
            filter_metadata=filter_metadata,
        )
-        # Build context string
+        # Build context string and collect pointers
        context_parts = []
        sources = []
        pointers = []
        for i, result in enumerate(results):
            context_parts.append(f"[{i+1}] {result['content']}")
-            if result.get("metadata", {}).get("source"):
+            
-                sources.append(result["metadata"]["source"])
+            metadata = result.get("metadata", {})
            # Collect source info
            if metadata.get("page_url"):
                sources.append(metadata["page_url"])
            elif metadata.get("source_url"):
                sources.append(metadata["source_url"])
            elif metadata.get("source"):
                sources.append(metadata["source"])
            # Collect pointer info
            if include_pointers and metadata.get("pointer"):
                pointer = metadata["pointer"]
                pointer["chunk_id"] = result.get("id")
                pointer["score"] = result.get("score")
                pointers.append(pointer)
        context = "\n\n".join(context_parts)
        return {
            "context": context,
            "sources": list(set(sources)),
            "pointers": pointers,
            "num_results": len(results),
            "results": results,
        }
@ -204,7 +446,23 @@ class RAGSystem:
    async def list_documents(self) -> list[dict[str, Any]]:
        """List all documents in the knowledge base."""
        self._ensure_initialized()
-        return await self._vector_store.list_documents()
+        
        # Get documents from vector store
        docs = await self._vector_store.list_documents()
        # Enrich with site registry info
        for doc in docs:
            source_url = doc.get("source_url")
            if source_url:
                site_id = self._generate_site_id(source_url)
                if site_id in self._site_registry:
                    doc["site_info"] = self._site_registry[site_id]
        return docs
    async def list_downloaded_sites(self) -> list[dict[str, Any]]:
        """List all downloaded websites."""
        return list(self._site_registry.values())
    async def delete_document(self, document_id: str) -> None:
        """Delete a document from the knowledge base."""
@ -212,6 +470,52 @@ class RAGSystem:
        await self._vector_store.delete_document(document_id)
        log.info(f"Deleted document {document_id}")
    async def delete_site(self, url: str) -> dict[str, Any]:
        """
        Delete a downloaded website and all its content from the knowledge base.

        Removes the site's chunks from the vector store, deletes its local
        directory and drops its registry entry. The local directory is only
        deleted when it lies strictly inside ``downloaded_sites_path``. A
        refused or failed directory removal is logged and does not stop the
        registry entry from being removed.

        Args:
            url: URL of the site to delete

        Returns:
            Dictionary with deletion results. ``success`` is False with a
            ``message`` when the site is not registered. Otherwise it holds
            ``success=True``, ``url``, ``deleted_chunks`` and ``deleted_path``
            (None when the local directory was not removed).

        Raises:
            RuntimeError: If the RAG system has not been initialized.
        """
        import shutil

        self._ensure_initialized()

        site_id = self._generate_site_id(url)
        site_info = self._site_registry.get(site_id)
        if site_info is None:
            return {"success": False, "message": f"Site not found: {url}"}

        local_path = site_info.get("local_path")

        # Delete from vector store
        deleted_chunks = await self._vector_store.delete_by_source_url(url)

        # Delete local files
        deleted_path = local_path
        if local_path and Path(local_path).exists():
            target = Path(local_path).resolve()
            root = self.downloaded_sites_path.resolve()
            if root not in target.parents:
                # The path comes from a JSON file on disk; never remove the
                # downloads root itself or anything outside it.
                log.warning(f"Refusing to delete path outside downloads directory: {local_path}")
                deleted_path = None
            else:
                try:
                    shutil.rmtree(target)
                except OSError as e:
                    # The chunks are already gone, so still drop the registry
                    # entry below rather than leave it pointing at nothing.
                    log.error(f"Failed to delete local files for {url}: {e}")
                    deleted_path = None

        # Remove from registry
        del self._site_registry[site_id]
        await self._save_site_registry()

        log.info(f"Deleted site: {url}")

        return {
            "success": True,
            "url": url,
            "deleted_chunks": deleted_chunks,
            "deleted_path": deleted_path,
        }
    def get_site_info(self, url: str) -> Optional[dict[str, Any]]:
        """Get information about a downloaded site."""
        site_id = self._generate_site_id(url)
        return self._site_registry.get(site_id)
 # Global RAG system instance
 _rag_system: Optional[RAGSystem] = None
@ -221,6 +525,7 @@ async def get_rag_system(
    embedding_model: str = "text-embedding-3-small",
    vector_store_path: str = "./data/vectors",
    documents_path: str = "./data/documents",
    downloaded_sites_path: str = "./data/downloaded_sites",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
 ) -> RAGSystem:
@ -231,6 +536,7 @@ async def get_rag_system(
        embedding_model: Name of the embedding model
        vector_store_path: Path to vector store
        documents_path: Path to document storage
        downloaded_sites_path: Path to downloaded websites
        chunk_size: Size of document chunks
        chunk_overlap: Overlap between chunks
@ -244,6 +550,7 @@ async def get_rag_system(
            embedding_model=embedding_model,
            vector_store_path=vector_store_path,
            documents_path=documents_path,
            downloaded_sites_path=downloaded_sites_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
--- a/rag/document_processor.py
+++ b/rag/document_processor.py
@ -167,6 +167,18 @@ class DocumentProcessor:
            log.error(f"HTML extraction failed: {e}")
            return ""
    async def extract_text_from_html(self, content: bytes) -> str:
        """
        Extract text from raw HTML; public entry point for ``_extract_html``.

        Args:
            content: Raw HTML content

        Returns:
            Extracted text content
        """
        extracted = await self._extract_html(content)
        return extracted
    async def _extract_docx(self, content: bytes) -> str:
        """Extract text from DOCX."""
        try:
--- a/rag/vector_store.py
+++ b/rag/vector_store.py
@ -283,3 +283,55 @@ class VectorStore:
        await self._save()
        log.info(f"Deleted document {document_id} ({len(indices_to_remove)} chunks)")
    async def delete_by_source_url(self, source_url: str) -> int:
        """
        Delete all chunks from a specific source URL.

        A chunk matches when its metadata ``source_url`` equals the argument
        exactly. The store is only saved when at least one chunk was removed.

        Args:
            source_url: The source URL to delete

        Returns:
            Number of deleted chunks
        """
        self._ensure_initialized()

        # Positions of the chunks to drop
        doomed = {
            i
            for i, metadata in enumerate(self._metadata)
            if metadata.get("source_url") == source_url
        }

        if doomed:
            # Rebuild each parallel list in one pass instead of popping index
            # by index, which shifts the tail of all four lists per removal.
            # Slice assignment keeps the list objects themselves in place.
            for store in (self._chunks, self._embeddings, self._metadata, self._ids):
                store[:] = [item for i, item in enumerate(store) if i not in doomed]

            # Save changes
            await self._save()

        log.info(f"Deleted {len(doomed)} chunks from source: {source_url}")
        return len(doomed)
    async def get_stats(self) -> dict[str, Any]:
        """
        Summarize the vector store contents.

        Returns:
            Dictionary with the chunk count, the number of distinct non-empty
            ``source`` and ``source_url`` metadata values, and the length of
            the first stored embedding (0 when there are none).
        """
        self._ensure_initialized()

        # Distinct truthy values per metadata key
        sources = {meta.get("source") for meta in self._metadata if meta.get("source")}
        source_urls = {
            meta.get("source_url") for meta in self._metadata if meta.get("source_url")
        }

        dimension = len(self._embeddings[0]) if self._embeddings else 0

        return {
            "total_chunks": len(self._chunks),
            "unique_sources": len(sources),
            "unique_urls": len(source_urls),
            "embedding_dimension": dimension,
        }