Features:
- RAG system now uses website_downloader_tool as primary content ingestion method
- download_and_ingest_website() method for complete website processing
- Stores page pointers (source_url, page_url, local_path) in vector store
- Site registry tracks all downloaded websites with metadata
- New API endpoints for website management:
  - POST /v1/documents/website - Download and ingest a website
  - GET /v1/documents/sites - List all downloaded sites
  - GET /v1/documents/sites/{url} - Get site info
  - DELETE /v1/documents/sites/{url} - Delete a site and its content
Changes:
- rag/__init__.py: Added download_and_ingest_website(), site registry
- rag/document_processor.py: Added extract_text_from_html() public method
- rag/vector_store.py: Added delete_by_source_url(), get_stats()
- main.py: New website endpoints, integrated tool with RAG system
260 lines
8.0 KiB
Python
260 lines
8.0 KiB
Python
"""
Document Processor - Handles document parsing and chunking

Supports multiple document formats:
- Plain text (.txt, .md)
- PDF (.pdf)
- HTML (.html, .htm)
- Word documents (.docx)
- Code files (.py, .js, etc.)
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import re
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
log = logging.getLogger(__name__)


class DocumentProcessor:
    """
    Process documents into chunks suitable for vector storage.

    Handles:
    - Multiple file formats (plain text, PDF, HTML, DOCX, source code, data files)
    - Sentence-aware chunking with overlap, bounded by ``chunk_size``
    - Per-chunk metadata and id generation
    """

    # Extensions that are decoded directly as UTF-8 text.
    _PLAIN_TEXT_EXTENSIONS = frozenset({
        # Prose / logs
        ".txt", ".md", ".rst", ".log",
        # Source code
        ".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs", ".rb",
        ".php", ".cs", ".swift", ".kt",
        # Structured data
        ".json", ".yaml", ".yml", ".xml", ".toml",
        # Tabular data
        ".csv", ".tsv",
    })

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """
        Args:
            chunk_size: Maximum number of characters in a chunk (must be >= 1).
            chunk_overlap: Maximum number of characters carried over from the
                end of one chunk to the start of the next (must be >= 0).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
        if chunk_overlap < 0:
            raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    async def process(
        self,
        content: bytes,
        filename: str,
        metadata: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """
        Process a document into chunks.

        Args:
            content: Raw document content
            filename: Original filename (its extension selects the extractor)
            metadata: Optional additional metadata merged into every chunk's
                metadata. Keys supplied here take precedence over the
                generated ``source`` and ``document_id`` entries.

        Returns:
            Dictionary with ``chunks``, ``metadatas``, ``ids``,
            ``document_id`` and ``total_chars``. When no text could be
            extracted the lists are empty and ``document_id`` is ``None``.
        """
        # Extract text based on file type
        text = await self._extract_text(content, filename)

        if not text.strip():
            # Same key set as the success path so callers can read
            # "total_chars" unconditionally.
            return {
                "chunks": [],
                "metadatas": [],
                "ids": [],
                "document_id": None,
                "total_chars": len(text),
            }

        document_id = str(uuid.uuid4())
        chunks = self._chunk_text(text)

        base_metadata = {
            "source": filename,
            "document_id": document_id,
            **(metadata or {}),
        }

        # MD5 is used only to derive a compact identifier from
        # "<document_id>_<index>", not for any security purpose.
        ids = [
            hashlib.md5(f"{document_id}_{index}".encode()).hexdigest()
            for index in range(len(chunks))
        ]
        metadatas = [
            {
                **base_metadata,
                "chunk_index": index,
                "chunk_length": len(chunk),
            }
            for index, chunk in enumerate(chunks)
        ]

        return {
            "chunks": chunks,
            "metadatas": metadatas,
            "ids": ids,
            "document_id": document_id,
            "total_chars": len(text),
        }

    async def _extract_text(self, content: bytes, filename: str) -> str:
        """
        Extract text from a document based on its file extension.

        Known text-like extensions are decoded as UTF-8 (undecodable bytes are
        dropped). Unknown extensions are decoded the same way unless the
        payload contains NUL bytes, in which case it is treated as binary and
        an empty string is returned. Any failure is logged and yields ``""``.
        """
        ext = Path(filename).suffix.lower()

        try:
            if ext == ".pdf":
                return await self._extract_pdf(content)
            if ext in (".html", ".htm"):
                return await self._extract_html(content)
            if ext == ".docx":
                return await self._extract_docx(content)

            # Decoding with errors="ignore" never raises, so binary payloads
            # must be detected explicitly; NUL bytes are a reliable signal
            # that the content is not UTF-8 text.
            if ext not in self._PLAIN_TEXT_EXTENSIONS and b"\x00" in content:
                log.warning("Unknown file type: %s, treating as binary", ext)
                return ""

            return content.decode("utf-8", errors="ignore")

        except Exception as e:
            log.error("Failed to extract text from %s: %s", filename, e)
            return ""

    async def _extract_pdf(self, content: bytes) -> str:
        """
        Extract text from a PDF using PyMuPDF.

        Page texts are joined with blank lines. Returns ``""`` when PyMuPDF is
        not installed or extraction fails.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            log.warning("PyMuPDF not installed, PDF extraction unavailable")
            return ""

        try:
            doc = fitz.open(stream=content, filetype="pdf")
            try:
                return "\n\n".join(page.get_text() for page in doc)
            finally:
                # Always release the document, even if a page fails to parse.
                doc.close()
        except Exception as e:
            log.error("PDF extraction failed: %s", e)
            return ""

    async def _extract_html(self, content: bytes) -> str:
        """
        Extract visible text from HTML using BeautifulSoup.

        Script, style, nav, footer and header elements are removed before the
        text is collected (one line per text node). If BeautifulSoup is not
        installed the raw markup is returned decoded as UTF-8; any other
        failure yields ``""``.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            log.warning("BeautifulSoup not installed, HTML extraction unavailable")
            return content.decode("utf-8", errors="ignore")

        try:
            soup = BeautifulSoup(content, "html.parser")
            # Drop non-content markup: scripts/styles plus page chrome.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            return soup.get_text(separator="\n", strip=True)
        except Exception as e:
            log.error("HTML extraction failed: %s", e)
            return ""

    async def extract_text_from_html(self, content: bytes) -> str:
        """
        Public method to extract text from HTML content.

        Args:
            content: Raw HTML content

        Returns:
            Extracted text content
        """
        return await self._extract_html(content)

    async def _extract_docx(self, content: bytes) -> str:
        """
        Extract text from a DOCX file using python-docx.

        Only the document's paragraphs are read; they are joined with blank
        lines. Returns ``""`` when python-docx is not installed or extraction
        fails.
        """
        try:
            from docx import Document
        except ImportError:
            log.warning("python-docx not installed, DOCX extraction unavailable")
            return ""

        try:
            import io

            doc = Document(io.BytesIO(content))
            return "\n\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            log.error("DOCX extraction failed: %s", e)
            return ""

    def _chunk_text(self, text: str) -> list[str]:
        """
        Split text into overlapping chunks of at most ``chunk_size`` characters.

        Uses a sentence-aware strategy to avoid breaking mid-sentence.
        Sentences longer than ``chunk_size`` (e.g. unpunctuated scraped text)
        are split at whitespace, or hard-cut when they contain none. The
        overlap from the previous chunk is only prepended when it fits within
        the size limit together with the next sentence.
        """
        stripped = text.strip()
        if not stripped:
            return []

        # Guard against instances whose attributes were set without __init__.
        limit = max(1, self.chunk_size)
        if len(stripped) <= limit:
            return [stripped]

        # When splitting oversized sentences, leave room for the overlap (and
        # its joining space) so consecutive pieces can still overlap.
        reserve = self.chunk_overlap + 1 if self.chunk_overlap > 0 else 0
        piece_limit = limit - reserve if limit - reserve >= 1 else limit

        pieces: list[str] = []
        for sentence in self._split_sentences(text):
            if len(sentence) > limit:
                pieces.extend(self._split_oversized(sentence, piece_limit))
            else:
                pieces.append(sentence)

        chunks: list[str] = []
        current: list[str] = []
        current_length = 0  # exact length of " ".join(current)

        for piece in pieces:
            added = len(piece) + (1 if current else 0)  # +1 for joining space

            if current and current_length + added > limit:
                chunks.append(" ".join(current))

                # Start the next chunk with overlap, if it fits.
                overlap_text = self._get_overlap_text(current)
                if overlap_text and len(overlap_text) + 1 + len(piece) <= limit:
                    current = [overlap_text, piece]
                    current_length = len(overlap_text) + 1 + len(piece)
                else:
                    current = [piece]
                    current_length = len(piece)
            else:
                current.append(piece)
                current_length += added

        if current:
            chunks.append(" ".join(current))

        return [c.strip() for c in chunks if c.strip()]

    def _split_oversized(self, sentence: str, piece_limit: int) -> list[str]:
        """Split one long sentence into pieces of at most ``piece_limit`` chars.

        Prefers the last whitespace inside each window; falls back to a hard
        cut when the window contains no whitespace.
        """
        piece_limit = max(1, piece_limit)
        pieces: list[str] = []
        remaining = sentence

        while len(remaining) > piece_limit:
            window = remaining[: piece_limit + 1]
            cut = max(window.rfind(ch) for ch in " \n\t")
            if cut <= 0:
                cut = piece_limit  # no usable whitespace: hard cut
            head = remaining[:cut].rstrip()
            if head:
                pieces.append(head)
            remaining = remaining[cut:].lstrip()

        if remaining:
            pieces.append(remaining)
        return pieces

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences.

        Splits on whitespace that follows ``.``, ``!`` or ``?``; empty
        fragments are discarded and each sentence is stripped.
        """
        # Simple sentence splitting - can be improved with NLP libraries
        sentence_endings = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_endings, text)
        return [s.strip() for s in sentences if s.strip()]

    def _get_overlap_text(self, chunk_parts: list[str]) -> str:
        """Get text for overlap from the end of the current chunk.

        Returns at most ``chunk_overlap`` characters taken from the end of the
        joined parts, trimmed to start at a word boundary where possible.
        Returns ``""`` when there are no parts or overlap is disabled.
        """
        # chunk_overlap == 0 must short-circuit: text[-0:] is the whole string.
        if not chunk_parts or self.chunk_overlap <= 0:
            return ""

        full_text = " ".join(chunk_parts)
        if len(full_text) <= self.chunk_overlap:
            return full_text

        tail = full_text[-self.chunk_overlap:]
        preceding = full_text[-self.chunk_overlap - 1]

        # Only drop the leading fragment when the cut landed inside a word.
        if not (tail[0].isspace() or preceding.isspace()):
            boundary = re.search(r"\s", tail)
            if boundary:
                tail = tail[boundary.end():]

        return tail.strip()
|