Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-01 18:19:08 +00:00)
feat: Add Docling support as ETL_SERVICE option
- Added DOCLING as a third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing with user selection

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows the same pattern as the LlamaCloud integration (PR #123)
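For orientation, a minimal sketch of the switch this commit introduces, simplified from the config.py and documents_routes.py hunks below; unlike the two hosted parsers, the DOCLING path needs no API key:

```python
# Minimal sketch of the three-way parser selection, reading the same
# environment variables the diffs below use. The UNSTRUCTURED default
# is an assumption of this sketch, not taken from the diff.
import os

ETL_SERVICE = os.getenv("ETL_SERVICE", "UNSTRUCTURED")

if ETL_SERVICE == "UNSTRUCTURED":
    api_key = os.getenv("UNSTRUCTURED_API_KEY")   # hosted parser, key required
elif ETL_SERVICE == "LLAMACLOUD":
    api_key = os.getenv("LLAMA_CLOUD_API_KEY")    # hosted parser, key required
elif ETL_SERVICE == "DOCLING":
    api_key = None                                # local processing, no key needed
```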
parent f852bcb188
commit aa00822169
14 changed files with 3125 additions and 2090 deletions
.env (new file, +17)

@@ -0,0 +1,17 @@
# Frontend Configuration
FRONTEND_PORT=3000
NEXT_PUBLIC_API_URL=http://backend:8000

# Backend Configuration
BACKEND_PORT=8000

# Database Configuration
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=surfsense
POSTGRES_PORT=5432

# pgAdmin Configuration
PGADMIN_PORT=5050
PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
PGADMIN_DEFAULT_PASSWORD=surfsense
.gitignore (vendored, +2)

@@ -1,2 +1,4 @@
 .flashrank_cache*
 podcasts/
+reports/
+SURFSENSE_CRITICAL_FIXES_REPORT.md
docker-compose.yml

@@ -2,7 +2,7 @@ version: '3.8'

 services:
   frontend:
-    image: ghcr.io/modsetter/surfsense_ui:latest
+    build: ./surfsense_web
     ports:
       - "${FRONTEND_PORT:-3000}:3000"
     volumes:

@@ -14,7 +14,7 @@ services:
       - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}

   backend:
-    image: ghcr.io/modsetter/surfsense_backend:latest
+    build: ./surfsense_backend
     ports:
       - "${BACKEND_PORT:-8000}:8000"
     volumes:

@@ -28,3 +28,15 @@ services:
       - PYTHONPATH=/app
       - UVICORN_LOOP=asyncio
       - UNSTRUCTURED_HAS_PATCHED_LOOP=1
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - LANGCHAIN_TRACING_V2=false
+      - LANGSMITH_TRACING=false
+      - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
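Since the compose change reserves all NVIDIA GPUs for the backend container, a quick hedged check confirms whether the passthrough actually surfaced a device (this assumes the CUDA-enabled torch install from the Dockerfile below):

```python
# Sketch: run inside the backend container to verify the GPU reservation;
# assumes torch was installed from the cu121 index as in the Dockerfile below.
import torch

if torch.cuda.is_available():
    print(f"CUDA OK: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device visible; Docling will fall back to CPU")
```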
surfsense_backend/.env.example

@@ -34,7 +34,7 @@ STT_SERVICE_API_BASE=
 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000

 # File Parser Service
-ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
 UNSTRUCTURED_API_KEY=Tpu3P0U8iy
 LLAMA_CLOUD_API_KEY=llx-nnn
surfsense_backend/Dockerfile

@@ -2,20 +2,53 @@ FROM python:3.12-slim

 WORKDIR /app

-# Install system dependencies
+# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
 RUN apt-get update && apt-get install -y --no-install-recommends \
     gcc \
     python3-dev \
+    ca-certificates \
+    curl \
+    wget \
+    unzip \
+    gnupg2 \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    libtesseract-dev \
+    libleptonica-dev \
     && rm -rf /var/lib/apt/lists/*

+# Update certificates and install SSL tools
+RUN update-ca-certificates
+RUN pip install --upgrade certifi pip-system-certs
+
 # Copy requirements
 COPY pyproject.toml .
 COPY uv.lock .

+# Install CUDA-enabled PyTorch for WSL2 before other dependencies
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
 # Install python dependencies
 RUN pip install --no-cache-dir uv && \
     uv pip install --system --no-cache-dir -e .

+# Set SSL environment variables dynamically
+RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
+    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
+    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
+    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
+ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+
+# Pre-download EasyOCR models to avoid runtime SSL issues
+RUN mkdir -p /root/.EasyOCR/model
+RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
+RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
+RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
+
+# Pre-download Docling models
+RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true
+
 # Install Playwright browsers for web scraping if needed
 RUN pip install playwright && \
     playwright install --with-deps chromium

@@ -27,6 +60,9 @@ COPY . .

 ENV PYTHONPATH=/app
 ENV UVICORN_LOOP=asyncio
+
+# Set Tesseract data path
+ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/

 # Run
 EXPOSE 8000
 CMD ["python", "main.py"]
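The RUN step above resolves certifi's bundle path at build time while the ENV lines hardcode one; a small sketch to confirm the two agree in a given image (the hardcoded path below is copied from the ENV lines, and treating it as the ground truth is this example's assumption):

```python
# Sketch: repeat the Dockerfile's certifi lookup and compare it with the
# hardcoded ENV value (copied from the ENV lines above).
import certifi

resolved = certifi.where()
hardcoded = "/usr/local/lib/python3.12/site-packages/certifi/cacert.pem"
print(resolved)
print("ENV path matches certifi:", resolved == hardcoded)
```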
@@ -157,6 +157,29 @@ def find_optimal_documents_with_binary_search(

 def get_model_context_window(model_name: str) -> int:
     """Get the total context window size for a model (input + output tokens)."""
+
+    # Known context windows for common models
+    model_contexts = {
+        'llama3.1:8b': 128000,    # Llama 3.1 8B has 128K context
+        'llama3.1:70b': 128000,   # Llama 3.1 70B has 128K context
+        'llama3.1:405b': 128000,  # Llama 3.1 405B has 128K context
+        'llama3:8b': 8192,        # Llama 3 8B has 8K context
+        'llama3:70b': 8192,       # Llama 3 70B has 8K context
+        'ollama/llama3.1:8b': 128000,
+        'ollama/llama3.1:70b': 128000,
+        'ollama/llama3:8b': 8192,
+        'ollama/llama3:70b': 8192,
+    }
+
+    # Check for exact match first
+    if model_name in model_contexts:
+        return model_contexts[model_name]
+
+    # Check for partial matches (e.g., model_name contains 'llama3.1')
+    for model_key, context_size in model_contexts.items():
+        if model_key in model_name.lower():
+            return context_size
+
     try:
         model_info = get_model_info(model_name)
         context_window = model_info.get(
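The lookup falls back from an exact key to a substring match before deferring to litellm's get_model_info; assuming get_model_context_window is in scope, for example:

```python
# Exact key hit: 'ollama/llama3.1:8b' is in the table above.
print(get_model_context_window('ollama/llama3.1:8b'))       # 128000
# Substring hit: 'llama3:8b' is found inside the lowercased name.
print(get_model_context_window('my-llama3:8b-finetune'))    # 8192
```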
surfsense_backend/app/config.py

@@ -84,6 +84,9 @@ class Config:

 # LlamaCloud API Key
 LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

+elif ETL_SERVICE == "DOCLING":
+    # Docling doesn't require API keys - uses local processing
+    pass
+
 # Firecrawl API Key
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
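Read together with the context lines, the new branch slots into a per-service key-validation chain; in the sketch below only the DOCLING branch is verbatim from the diff, while the UNSTRUCTURED and LLAMACLOUD branches are inferred from the surrounding context:

```python
# Inferred shape of the Config validation chain; the hosted-parser branches
# are assumptions based on the context lines, not shown in the hunk.
if ETL_SERVICE == "UNSTRUCTURED":
    UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
elif ETL_SERVICE == "LLAMACLOUD":
    LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
elif ETL_SERVICE == "DOCLING":
    # Docling doesn't require API keys - uses local processing
    pass
```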
documents_routes.py

@@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
 from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
-from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
+from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
 from app.config import config as app_config
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio

@@ -269,6 +269,31 @@ async def process_file_in_background(
                 search_space_id=search_space_id,
                 user_id=user_id
             )
+        elif app_config.ETL_SERVICE == "DOCLING":
+            # Use Docling service for document processing
+            from app.services.document_processing.docling_service import create_docling_service
+
+            # Create Docling service
+            docling_service = create_docling_service()
+
+            # Process the document
+            result = await docling_service.process_document(file_path, filename)
+
+            # Clean up the temp file
+            import os
+            try:
+                os.unlink(file_path)
+            except OSError:
+                pass
+
+            # Process the document using our Docling background task
+            await add_received_file_document_using_docling(
+                session,
+                filename,
+                docling_markdown_document=result['content'],
+                search_space_id=search_space_id,
+                user_id=user_id
+            )
     except Exception as e:
         import logging
         logging.error(f"Error processing file in background: {str(e)}")
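A side note on the temp-file cleanup in the DOCLING branch: suppressing only OSError (rather than a bare except) keeps real bugs visible, and the same effect can be written more compactly:

```python
# Equivalent cleanup using contextlib.suppress; file_path is the temporary
# upload path from the surrounding function.
import contextlib
import os

with contextlib.suppress(OSError):
    os.unlink(file_path)
```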
surfsense_backend/app/services/document_processing/__init__.py (new file, +1)

@@ -0,0 +1 @@
# Document processing services for SurfSense
surfsense_backend/app/services/document_processing/docling_service.py (new file, +346)

@@ -0,0 +1,346 @@
#!/usr/bin/env python3
"""
Docling Document Processing Service for SurfSense
SSL-safe implementation with pre-downloaded models
"""

import logging
import ssl
import os
from typing import Dict, Any

logger = logging.getLogger(__name__)


class DoclingService:
    """Docling service for enhanced document processing with SSL fixes."""

    def __init__(self):
        """Initialize Docling service with SSL, model fixes, and GPU acceleration."""
        self.converter = None
        self.use_gpu = False
        self._configure_ssl_environment()
        self._check_wsl2_gpu_support()
        self._initialize_docling()

    def _configure_ssl_environment(self):
        """Configure SSL environment for secure model downloads."""
        try:
            # Set SSL context for downloads
            ssl._create_default_https_context = ssl._create_unverified_context

            # Set SSL environment variables if not already set
            if not os.environ.get('SSL_CERT_FILE'):
                try:
                    import certifi
                    os.environ['SSL_CERT_FILE'] = certifi.where()
                    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
                except ImportError:
                    pass

            logger.info("🔐 SSL environment configured for model downloads")
        except Exception as e:
            logger.warning(f"⚠️ SSL configuration warning: {e}")

    def _check_wsl2_gpu_support(self):
        """Check and configure GPU support for WSL2 environment."""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_count = torch.cuda.device_count()
                gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
                logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
                logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
                self.use_gpu = True
            else:
                logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
                self.use_gpu = False
        except ImportError:
            logger.info("⚠️ PyTorch not found, falling back to CPU")
            self.use_gpu = False
        except Exception as e:
            logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
            self.use_gpu = False

    def _initialize_docling(self):
        """Initialize Docling with version-safe configuration."""
        try:
            from docling.document_converter import DocumentConverter, PdfFormatOption
            from docling.datamodel.base_models import InputFormat
            from docling.datamodel.pipeline_options import PdfPipelineOptions
            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

            logger.info("🔧 Initializing Docling with version-safe configuration...")

            # Create pipeline options with version-safe attribute checking
            pipeline_options = PdfPipelineOptions()

            # Disable OCR (user request)
            if hasattr(pipeline_options, 'do_ocr'):
                pipeline_options.do_ocr = False
                logger.info("⚠️ OCR disabled by user request")
            else:
                logger.warning("⚠️ OCR attribute not available in this Docling version")

            # Enable table structure if available
            if hasattr(pipeline_options, 'do_table_structure'):
                pipeline_options.do_table_structure = True
                logger.info("✅ Table structure detection enabled")

            # Configure GPU acceleration for WSL2 if available
            if hasattr(pipeline_options, 'accelerator_device'):
                if self.use_gpu:
                    try:
                        pipeline_options.accelerator_device = "cuda"
                        logger.info("🚀 GPU acceleration enabled (CUDA)")
                    except Exception as e:
                        logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
                        pipeline_options.accelerator_device = "cpu"
                else:
                    pipeline_options.accelerator_device = "cpu"
                    logger.info("🖥️ Using CPU acceleration")
            else:
                logger.info("ℹ️ Accelerator device attribute not available in this Docling version")

            # Create PDF format option with backend
            pdf_format_option = PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend
            )

            # Initialize DocumentConverter
            self.converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: pdf_format_option
                }
            )

            acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
            logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")

        except ImportError as e:
            logger.error(f"❌ Docling not installed: {e}")
            raise RuntimeError(f"Docling not available: {e}")
        except Exception as e:
            logger.error(f"❌ Docling initialization failed: {e}")
            raise RuntimeError(f"Docling initialization failed: {e}")

    def _configure_easyocr_local_models(self):
        """Configure EasyOCR to use pre-downloaded local models."""
        try:
            import easyocr
            import os

            # Set SSL environment for EasyOCR downloads
            os.environ['CURL_CA_BUNDLE'] = ''
            os.environ['REQUESTS_CA_BUNDLE'] = ''

            # Try to use local models first, fall back to download if needed
            try:
                reader = easyocr.Reader(['en'],
                                        download_enabled=False,
                                        model_storage_directory="/root/.EasyOCR/model")
                logger.info("✅ EasyOCR configured for local models")
                return reader
            except Exception:
                # If local models fail, allow download with SSL bypass
                logger.info("🔄 Local models failed, attempting download with SSL bypass...")
                reader = easyocr.Reader(['en'],
                                        download_enabled=True,
                                        model_storage_directory="/root/.EasyOCR/model")
                logger.info("✅ EasyOCR configured with downloaded models")
                return reader
        except Exception as e:
            logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
            return None

    async def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
        """Process document with Docling using pre-downloaded models."""

        if self.converter is None:
            raise RuntimeError("Docling converter not initialized")

        try:
            logger.info(f"🔄 Processing {filename} with Docling (using local models)...")

            # Process document with local models
            result = self.converter.convert(file_path)

            # Extract content using version-safe methods
            content = None
            if hasattr(result, 'document') and result.document:
                # Try different export methods (version compatibility)
                if hasattr(result.document, 'export_to_markdown'):
                    content = result.document.export_to_markdown()
                    logger.info("📄 Used export_to_markdown method")
                elif hasattr(result.document, 'to_markdown'):
                    content = result.document.to_markdown()
                    logger.info("📄 Used to_markdown method")
                elif hasattr(result.document, 'text'):
                    content = result.document.text
                    logger.info("📄 Used text property")
                elif hasattr(result.document, '__str__'):
                    content = str(result.document)
                    logger.info("📄 Used string conversion")

                if content:
                    logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")

                    return {
                        'content': content,
                        'full_text': content,
                        'service_used': 'docling',
                        'status': 'success',
                        'processing_notes': 'Processed with Docling using pre-downloaded models'
                    }
                else:
                    raise ValueError("No content could be extracted from document")
            else:
                raise ValueError("No document object returned by Docling")

        except Exception as e:
            logger.error(f"❌ Docling processing failed for {filename}: {e}")
            # Log the full error for debugging
            import traceback
            logger.error(f"Full traceback: {traceback.format_exc()}")
            raise RuntimeError(f"Docling processing failed: {e}")

    async def process_large_document_summary(
        self,
        content: str,
        llm,
        document_title: str = "Document"
    ) -> str:
        """
        Process large documents using chunked LLM summarization.

        Args:
            content: The full document content
            llm: The language model to use for summarization
            document_title: Title of the document for context

        Returns:
            Final summary of the document
        """
        # Large document threshold (100K characters ≈ 25K tokens)
        LARGE_DOCUMENT_THRESHOLD = 100_000

        if len(content) <= LARGE_DOCUMENT_THRESHOLD:
            # For smaller documents, use direct processing
            logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
            from app.prompts import SUMMARY_PROMPT_TEMPLATE
            summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
            result = await summary_chain.ainvoke({"document": content})
            return result.content

        logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")

        # Import chunker from config
        from app.config import config
        from langchain_core.prompts import PromptTemplate

        # Create LLM-optimized chunks (8K tokens max for safety)
        from chonkie import RecursiveChunker, OverlapRefinery
        llm_chunker = RecursiveChunker(
            chunk_size=8000  # Conservative for most LLMs
        )

        # Apply overlap refinery for context preservation (10% overlap = 800 tokens)
        overlap_refinery = OverlapRefinery(
            context_size=0.1,  # 10% overlap for context preservation
            method="suffix"    # Add next chunk context to current chunk
        )

        # First chunk the content, then apply overlap refinery
        initial_chunks = llm_chunker.chunk(content)
        chunks = overlap_refinery.refine(initial_chunks)
        total_chunks = len(chunks)

        logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")

        # Template for chunk processing
        chunk_template = PromptTemplate(
            input_variables=["chunk", "chunk_number", "total_chunks"],
            template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.

Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes

Provide a clear, structured summary that captures the essential content.

Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>"""
        )

        # Process each chunk individually
        chunk_summaries = []
        for i, chunk in enumerate(chunks, 1):
            try:
                logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")

                chunk_chain = chunk_template | llm
                chunk_result = await chunk_chain.ainvoke({
                    "chunk": chunk.text,
                    "chunk_number": i,
                    "total_chunks": total_chunks
                })

                chunk_summary = chunk_result.content
                chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")

                logger.info(f"✅ Completed chunk {i}/{total_chunks}")

            except Exception as e:
                logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
                chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")

        # Combine summaries into final document summary
        logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")

        try:
            combine_template = PromptTemplate(
                input_variables=["summaries", "document_title"],
                template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.

Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone

<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>"""
            )

            combined_summaries = "\n\n".join(chunk_summaries)
            combine_chain = combine_template | llm

            final_result = await combine_chain.ainvoke({
                "summaries": combined_summaries,
                "document_title": document_title
            })

            final_summary = final_result.content
            logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")

            return final_summary

        except Exception as e:
            logger.error(f"❌ Failed to combine summaries: {e}")
            # Fallback: return concatenated chunk summaries
            fallback_summary = "\n\n".join(chunk_summaries)
            logger.warning("⚠️ Using fallback combined summary")
            return fallback_summary


def create_docling_service() -> DoclingService:
    """Create a Docling service instance."""
    return DoclingService()
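A hedged end-to-end sketch of the new service, assuming it runs inside the backend environment; the PDF path is illustrative:

```python
# Sketch: exercise DoclingService end to end. The file path is an assumption;
# create_docling_service() detects the GPU, configures SSL, and builds the converter.
import asyncio
from app.services.document_processing.docling_service import create_docling_service

async def main():
    service = create_docling_service()
    result = await service.process_document("/tmp/example.pdf", "example.pdf")
    print(result['status'], len(result['content']))
    # For content over 100K chars, process_large_document_summary() chunks at
    # 8000 tokens with a 10% suffix overlap (roughly 800 tokens of carryover).

asyncio.run(main())
```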
surfsense_backend/app/tasks/background_tasks.py

@@ -459,6 +459,94 @@ async def add_received_file_document_using_llamacloud(
         raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")


+async def add_received_file_document_using_docling(
+    session: AsyncSession,
+    file_name: str,
+    docling_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+) -> Optional[Document]:
+    """
+    Process and store document content parsed by Docling.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        docling_markdown_document: Markdown content from Docling parsing
+        search_space_id: ID of the search space
+        user_id: ID of the user
+
+    Returns:
+        Document object if successful, None if failed
+    """
+    try:
+        file_in_markdown = docling_markdown_document
+
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
+
+        # Check if document with this content hash already exists
+        existing_doc_result = await session.execute(
+            select(Document).where(Document.content_hash == content_hash)
+        )
+        existing_document = existing_doc_result.scalars().first()
+
+        if existing_document:
+            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
+            return existing_document
+
+        # Get user's long context LLM
+        user_llm = await get_user_long_context_llm(session, user_id)
+        if not user_llm:
+            raise RuntimeError(f"No long context LLM configured for user {user_id}")
+
+        # Generate summary using chunked processing for large documents
+        from app.services.document_processing.docling_service import create_docling_service
+        docling_service = create_docling_service()
+
+        summary_content = await docling_service.process_large_document_summary(
+            content=file_in_markdown,
+            llm=user_llm,
+            document_title=file_name
+        )
+        summary_embedding = config.embedding_model_instance.embed(summary_content)
+
+        # Process chunks
+        chunks = [
+            Chunk(
+                content=chunk.text,
+                embedding=config.embedding_model_instance.embed(chunk.text),
+            )
+            for chunk in config.chunker_instance.chunk(file_in_markdown)
+        ]
+
+        # Create and store document
+        document = Document(
+            search_space_id=search_space_id,
+            title=file_name,
+            document_type=DocumentType.FILE,
+            document_metadata={
+                "FILE_NAME": file_name,
+                "ETL_SERVICE": "DOCLING",
+            },
+            content=summary_content,
+            embedding=summary_embedding,
+            chunks=chunks,
+            content_hash=content_hash,
+        )
+
+        session.add(document)
+        await session.commit()
+        await session.refresh(document)
+
+        return document
+    except SQLAlchemyError as db_error:
+        await session.rollback()
+        raise db_error
+    except Exception as e:
+        await session.rollback()
+        raise RuntimeError(f"Failed to process file document using Docling: {str(e)}")
+
+
 async def add_youtube_video_document(
     session: AsyncSession, url: str, search_space_id: int, user_id: str
 ):
surfsense_backend/pyproject.toml

@@ -9,6 +9,7 @@ dependencies = [
     "asyncpg>=0.30.0",
     "chonkie[all]>=1.0.6",
     "discord-py>=2.5.2",
+    "docling>=2.15.0",
     "fastapi>=0.115.8",
     "fastapi-users[oauth,sqlalchemy]>=14.0.1",
     "firecrawl-py>=1.12.0",

@@ -17,7 +18,7 @@ dependencies = [
     "langchain-unstructured>=0.1.6",
     "langgraph>=0.3.29",
     "linkup-sdk>=0.2.4",
-    "litellm>=1.61.4",
+    "litellm>=1.61.4,<1.70.0",
     "llama-cloud-services>=0.6.25",
     "markdownify>=0.14.1",
     "notion-client>=2.3.0",

@@ -29,6 +30,7 @@ dependencies = [
     "slack-sdk>=3.34.0",
     "static-ffmpeg>=2.13",
     "tavily-python>=0.3.2",
+    "tesserocr>=2.8.0",
     "unstructured-client>=0.30.0",
     "unstructured[all-docs]>=0.16.25",
     "uvicorn[standard]>=0.34.0",
surfsense_backend/uv.lock (generated, 4646 lines changed)

File diff suppressed because it is too large.
surfsense_web/.env.example

@@ -1,3 +1,3 @@
 NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
 NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
-NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING