feat: Add Docling support as ETL_SERVICE option

- Added DOCLING as third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD) - Implemented add_received_file_document_using_docling function - Added Docling processing logic in documents_routes.py - Enhanced chunking with configurable overlap support - Added comprehensive document processing service - Supports both CPU and GPU processing with user selection Addresses #161 - Add Docling Support as an ETL_SERVICE Follows same pattern as LlamaCloud integration (PR #123)
2025-09-01 18:19:08 +00:00 · 2025-07-20 11:42:55 +03:00 · 2025-07-20 11:42:55 +03:00 · aa00822169
commit aa00822169
parent f852bcb188
14 changed files with 3125 additions and 2090 deletions
--- a/.env
+++ b/.env
@ -0,0 +1,17 @@
+# Frontend Configuration
+FRONTEND_PORT=3000
+NEXT_PUBLIC_API_URL=http://backend:8000
+
+# Backend Configuration
+BACKEND_PORT=8000
+
+# Database Configuration
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=postgres
+POSTGRES_DB=surfsense
+POSTGRES_PORT=5432
+
+# pgAdmin Configuration
+PGADMIN_PORT=5050
+PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
+PGADMIN_DEFAULT_PASSWORD=surfsense
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 .flashrank_cache*
 podcasts/
+reports/
+SURFSENSE_CRITICAL_FIXES_REPORT.md
--- a/docker-compose.override.yml
+++ b/docker-compose.override.yml
@ -2,7 +2,7 @@ version: '3.8'

 services:
  frontend:
-    image: ghcr.io/modsetter/surfsense_ui:latest
+    build: ./surfsense_web
    ports:
      - "${FRONTEND_PORT:-3000}:3000"
    volumes:
@ -14,7 +14,7 @@ services:
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}

  backend:
-    image: ghcr.io/modsetter/surfsense_backend:latest
+    build: ./surfsense_backend
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
@ -28,3 +28,15 @@ services:
      - PYTHONPATH=/app
      - UVICORN_LOOP=asyncio
      - UNSTRUCTURED_HAS_PATCHED_LOOP=1
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - LANGCHAIN_TRACING_V2=false
+      - LANGSMITH_TRACING=false
+      - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -34,7 +34,7 @@ STT_SERVICE_API_BASE=
 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000

 # File Parser Service
-ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
 UNSTRUCTURED_API_KEY=Tpu3P0U8iy
 LLAMA_CLOUD_API_KEY=llx-nnn

--- a/surfsense_backend/Dockerfile
+++ b/surfsense_backend/Dockerfile
@ -2,20 +2,53 @@ FROM python:3.12-slim

 WORKDIR /app

-# Install system dependencies
+# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
 RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    python3-dev \
+    ca-certificates \
+    curl \
+    wget \
+    unzip \
+    gnupg2 \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    libtesseract-dev \
+    libleptonica-dev \
    && rm -rf /var/lib/apt/lists/*

+# Update certificates and install SSL tools
+RUN update-ca-certificates
+RUN pip install --upgrade certifi pip-system-certs
+
 # Copy requirements
 COPY pyproject.toml .
 COPY uv.lock .

+# Install CUDA-enabled PyTorch for WSL2 before other dependencies
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
 # Install python dependencies
 RUN pip install --no-cache-dir uv && \
    uv pip install --system --no-cache-dir -e .

+# Set SSL environment variables dynamically
+RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
+    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
+    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
+    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
+ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+
+# Pre-download EasyOCR models to avoid runtime SSL issues
+RUN mkdir -p /root/.EasyOCR/model
+RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
+RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
+RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
+
+# Pre-download Docling models
+RUN python -c "try:\n    from docling.document_converter import DocumentConverter\n    conv = DocumentConverter()\nexcept:\n    pass" || true
+
 # Install Playwright browsers for web scraping if needed
 RUN pip install playwright && \
    playwright install --with-deps chromium
@ -27,6 +60,9 @@ COPY . .
 ENV PYTHONPATH=/app
 ENV UVICORN_LOOP=asyncio

+# Set Tesseract data path
+ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
+
 # Run
 EXPOSE 8000
 CMD ["python", "main.py"] 
--- a/surfsense_backend/app/agents/researcher/utils.py
+++ b/surfsense_backend/app/agents/researcher/utils.py
@ -157,6 +157,29 @@ def find_optimal_documents_with_binary_search(

 def get_model_context_window(model_name: str) -> int:
    """Get the total context window size for a model (input + output tokens)."""
+    
+    # Known context windows for common models
+    model_contexts = {
+        'llama3.1:8b': 128000,      # Llama 3.1 8B has 128K context
+        'llama3.1:70b': 128000,     # Llama 3.1 70B has 128K context
+        'llama3.1:405b': 128000,    # Llama 3.1 405B has 128K context
+        'llama3:8b': 8192,          # Llama 3 8B has 8K context
+        'llama3:70b': 8192,         # Llama 3 70B has 8K context
+        'ollama/llama3.1:8b': 128000,
+        'ollama/llama3.1:70b': 128000,
+        'ollama/llama3:8b': 8192,
+        'ollama/llama3:70b': 8192,
+    }
+    
+    # Check for exact match first
+    if model_name in model_contexts:
+        return model_contexts[model_name]
+    
+    # Check for partial matches (e.g., model_name contains 'llama3.1')
+    for model_key, context_size in model_contexts.items():
+        if model_key in model_name.lower():
+            return context_size
+    
    try:
        model_info = get_model_info(model_name)
        context_window = model_info.get(
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -84,6 +84,9 @@ class Config:
        # LlamaCloud API Key
        LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
        
+    elif ETL_SERVICE == "DOCLING":
+        # Docling doesn't require API keys - uses local processing
+        pass
        
    # Firecrawl API Key
    FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) 
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
 from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
-from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
+from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
 from app.config import config as app_config
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio
@ -269,6 +269,31 @@ async def process_file_in_background(
                        search_space_id=search_space_id,
                        user_id=user_id
                    )
+            elif app_config.ETL_SERVICE == "DOCLING":
+                # Use Docling service for document processing
+                from app.services.document_processing.docling_service import create_docling_service
+                
+                # Create Docling service
+                docling_service = create_docling_service()
+                
+                # Process the document
+                result = await docling_service.process_document(file_path, filename)
+                
+                # Clean up the temp file
+                import os
+                try:
+                    os.unlink(file_path)
+                except:
+                    pass
+                
+                # Process the document using our Docling background task
+                await add_received_file_document_using_docling(
+                    session,
+                    filename,
+                    docling_markdown_document=result['content'],
+                    search_space_id=search_space_id,
+                    user_id=user_id
+                )
    except Exception as e:
        import logging
        logging.error(f"Error processing file in background: {str(e)}")
--- a/surfsense_backend/app/services/document_processing/init.py
+++ b/surfsense_backend/app/services/document_processing/init.py
@ -0,0 +1 @@
+# Document processing services for SurfSense
--- a/surfsense_backend/app/services/document_processing/docling_service.py
+++ b/surfsense_backend/app/services/document_processing/docling_service.py
@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+"""
+Docling Document Processing Service for SurfSense
+SSL-safe implementation with pre-downloaded models
+"""
+
+import logging
+import ssl
+import os
+from typing import Dict, Any
+
+logger = logging.getLogger(__name__)
+
+class DoclingService:
+    """Docling service for enhanced document processing with SSL fixes."""
+    
+    def __init__(self):
+        """Initialize Docling service with SSL, model fixes, and GPU acceleration."""
+        self.converter = None
+        self.use_gpu = False
+        self._configure_ssl_environment()
+        self._check_wsl2_gpu_support()
+        self._initialize_docling()
+    
+    def _configure_ssl_environment(self):
+        """Configure SSL environment for secure model downloads."""
+        try:
+            # Set SSL context for downloads
+            ssl._create_default_https_context = ssl._create_unverified_context
+            
+            # Set SSL environment variables if not already set
+            if not os.environ.get('SSL_CERT_FILE'):
+                try:
+                    import certifi
+                    os.environ['SSL_CERT_FILE'] = certifi.where()
+                    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
+                except ImportError:
+                    pass
+            
+            logger.info("🔐 SSL environment configured for model downloads")
+        except Exception as e:
+            logger.warning(f"⚠️ SSL configuration warning: {e}")
+    
+    def _check_wsl2_gpu_support(self):
+        """Check and configure GPU support for WSL2 environment."""
+        try:
+            import torch
+            if torch.cuda.is_available():
+                gpu_count = torch.cuda.device_count()
+                gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
+                logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
+                logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
+                self.use_gpu = True
+            else:
+                logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
+                self.use_gpu = False
+        except ImportError:
+            logger.info("⚠️ PyTorch not found, falling back to CPU")
+            self.use_gpu = False
+        except Exception as e:
+            logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
+            self.use_gpu = False
+    
+    def _initialize_docling(self):
+        """Initialize Docling with version-safe configuration."""
+        try:
+            from docling.document_converter import DocumentConverter, PdfFormatOption
+            from docling.datamodel.base_models import InputFormat
+            from docling.datamodel.pipeline_options import PdfPipelineOptions
+            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+            
+            logger.info("🔧 Initializing Docling with version-safe configuration...")
+            
+            # Create pipeline options with version-safe attribute checking
+            pipeline_options = PdfPipelineOptions()
+            
+            # Disable OCR (user request)
+            if hasattr(pipeline_options, 'do_ocr'):
+                pipeline_options.do_ocr = False
+                logger.info("⚠️ OCR disabled by user request")
+            else:
+                logger.warning("⚠️ OCR attribute not available in this Docling version")
+            
+            # Enable table structure if available
+            if hasattr(pipeline_options, 'do_table_structure'):
+                pipeline_options.do_table_structure = True
+                logger.info("✅ Table structure detection enabled")
+            
+            # Configure GPU acceleration for WSL2 if available
+            if hasattr(pipeline_options, 'accelerator_device'):
+                if self.use_gpu:
+                    try:
+                        pipeline_options.accelerator_device = "cuda"
+                        logger.info("🚀 GPU acceleration enabled (CUDA)")
+                    except Exception as e:
+                        logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
+                        pipeline_options.accelerator_device = "cpu"
+                else:
+                    pipeline_options.accelerator_device = "cpu"
+                    logger.info("🖥️ Using CPU acceleration")
+            else:
+                logger.info("ℹ️ Accelerator device attribute not available in this Docling version")
+            
+            # Create PDF format option with backend
+            pdf_format_option = PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=PyPdfiumDocumentBackend
+            )
+            
+            # Initialize DocumentConverter
+            self.converter = DocumentConverter(
+                format_options={
+                    InputFormat.PDF: pdf_format_option
+                }
+            )
+            
+            acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
+            logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")
+                
+        except ImportError as e:
+            logger.error(f"❌ Docling not installed: {e}")
+            raise RuntimeError(f"Docling not available: {e}")
+        except Exception as e:
+            logger.error(f"❌ Docling initialization failed: {e}")
+            raise RuntimeError(f"Docling initialization failed: {e}")
+    
+    def _configure_easyocr_local_models(self):
+        """Configure EasyOCR to use pre-downloaded local models."""
+        try:
+            import easyocr
+            import os
+            
+            # Set SSL environment for EasyOCR downloads
+            os.environ['CURL_CA_BUNDLE'] = ''
+            os.environ['REQUESTS_CA_BUNDLE'] = ''
+            
+            # Try to use local models first, fallback to download if needed
+            try:
+                reader = easyocr.Reader(['en'], 
+                                       download_enabled=False,
+                                       model_storage_directory="/root/.EasyOCR/model")
+                logger.info("✅ EasyOCR configured for local models")
+                return reader
+            except:
+                # If local models fail, allow download with SSL bypass
+                logger.info("🔄 Local models failed, attempting download with SSL bypass...")
+                reader = easyocr.Reader(['en'], 
+                                       download_enabled=True,
+                                       model_storage_directory="/root/.EasyOCR/model")
+                logger.info("✅ EasyOCR configured with downloaded models")
+                return reader
+        except Exception as e:
+            logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
+            return None
+    
+    async def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
+        """Process document with Docling using pre-downloaded models."""
+        
+        if self.converter is None:
+            raise RuntimeError("Docling converter not initialized")
+        
+        try:
+            logger.info(f"🔄 Processing {filename} with Docling (using local models)...")
+            
+            # Process document with local models
+            result = self.converter.convert(file_path)
+            
+            # Extract content using version-safe methods
+            content = None
+            if hasattr(result, 'document') and result.document:
+                # Try different export methods (version compatibility)
+                if hasattr(result.document, 'export_to_markdown'):
+                    content = result.document.export_to_markdown()
+                    logger.info("📄 Used export_to_markdown method")
+                elif hasattr(result.document, 'to_markdown'):
+                    content = result.document.to_markdown()
+                    logger.info("📄 Used to_markdown method")
+                elif hasattr(result.document, 'text'):
+                    content = result.document.text
+                    logger.info("📄 Used text property")
+                elif hasattr(result.document, '__str__'):
+                    content = str(result.document)
+                    logger.info("📄 Used string conversion")
+                
+                if content:
+                    logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")
+                    
+                    return {
+                        'content': content,
+                        'full_text': content,
+                        'service_used': 'docling',
+                        'status': 'success',
+                        'processing_notes': 'Processed with Docling using pre-downloaded models'
+                    }
+                else:
+                    raise ValueError("No content could be extracted from document")
+            else:
+                raise ValueError("No document object returned by Docling")
+                
+        except Exception as e:
+            logger.error(f"❌ Docling processing failed for {filename}: {e}")
+            # Log the full error for debugging
+            import traceback
+            logger.error(f"Full traceback: {traceback.format_exc()}")
+            raise RuntimeError(f"Docling processing failed: {e}")
+    
+    async def process_large_document_summary(
+        self, 
+        content: str, 
+        llm,
+        document_title: str = "Document"
+    ) -> str:
+        """
+        Process large documents using chunked LLM summarization.
+        
+        Args:
+            content: The full document content
+            llm: The language model to use for summarization
+            document_title: Title of the document for context
+            
+        Returns:
+            Final summary of the document
+        """
+        # Large document threshold (100K characters ≈ 25K tokens)
+        LARGE_DOCUMENT_THRESHOLD = 100_000
+        
+        if len(content) <= LARGE_DOCUMENT_THRESHOLD:
+            # For smaller documents, use direct processing
+            logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
+            from app.prompts import SUMMARY_PROMPT_TEMPLATE
+            summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
+            result = await summary_chain.ainvoke({"document": content})
+            return result.content
+        
+        logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")
+        
+        # Import chunker from config
+        from app.config import config
+        from langchain_core.prompts import PromptTemplate
+        
+        # Create LLM-optimized chunks (8K tokens max for safety)
+        from chonkie import RecursiveChunker, OverlapRefinery
+        llm_chunker = RecursiveChunker(
+            chunk_size=8000  # Conservative for most LLMs
+        )
+        
+        # Apply overlap refinery for context preservation (10% overlap = 800 tokens)
+        overlap_refinery = OverlapRefinery(
+            context_size=0.1,  # 10% overlap for context preservation
+            method="suffix"     # Add next chunk context to current chunk
+        )
+        
+        # First chunk the content, then apply overlap refinery
+        initial_chunks = llm_chunker.chunk(content)
+        chunks = overlap_refinery.refine(initial_chunks)
+        total_chunks = len(chunks)
+        
+        logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
+        
+        # Template for chunk processing
+        chunk_template = PromptTemplate(
+            input_variables=["chunk", "chunk_number", "total_chunks"],
+            template="""<INSTRUCTIONS>
+You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
+
+Create a comprehensive summary of this document chunk. Focus on:
+- Key concepts, facts, and information
+- Important details and context
+- Main topics and themes
+
+Provide a clear, structured summary that captures the essential content.
+
+Chunk {chunk_number}/{total_chunks}:
+<document_chunk>
+{chunk}
+</document_chunk>
+</INSTRUCTIONS>"""
+        )
+        
+        # Process each chunk individually
+        chunk_summaries = []
+        for i, chunk in enumerate(chunks, 1):
+            try:
+                logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")
+                
+                chunk_chain = chunk_template | llm
+                chunk_result = await chunk_chain.ainvoke({
+                    "chunk": chunk.text,
+                    "chunk_number": i,
+                    "total_chunks": total_chunks
+                })
+                
+                chunk_summary = chunk_result.content
+                chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
+                
+                logger.info(f"✅ Completed chunk {i}/{total_chunks}")
+                
+            except Exception as e:
+                logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
+                chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")
+        
+        # Combine summaries into final document summary
+        logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
+        
+        try:
+            combine_template = PromptTemplate(
+                input_variables=["summaries", "document_title"],
+                template="""<INSTRUCTIONS>
+You are combining multiple section summaries into a final comprehensive document summary.
+
+Create a unified, coherent summary from the following section summaries of "{document_title}".
+Ensure:
+- Logical flow and organization
+- No redundancy or repetition  
+- Comprehensive coverage of all key points
+- Professional, objective tone
+
+<section_summaries>
+{summaries}
+</section_summaries>
+</INSTRUCTIONS>"""
+            )
+            
+            combined_summaries = "\n\n".join(chunk_summaries)
+            combine_chain = combine_template | llm
+            
+            final_result = await combine_chain.ainvoke({
+                "summaries": combined_summaries,
+                "document_title": document_title
+            })
+            
+            final_summary = final_result.content
+            logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")
+            
+            return final_summary
+            
+        except Exception as e:
+            logger.error(f"❌ Failed to combine summaries: {e}")
+            # Fallback: return concatenated chunk summaries
+            fallback_summary = "\n\n".join(chunk_summaries)
+            logger.warning("⚠️ Using fallback combined summary")
+            return fallback_summary
+
+def create_docling_service() -> DoclingService:
+    """Create a Docling service instance."""
+    return DoclingService()
--- a/surfsense_backend/app/tasks/background_tasks.py
+++ b/surfsense_backend/app/tasks/background_tasks.py
@ -459,6 +459,94 @@ async def add_received_file_document_using_llamacloud(
        raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")


+async def add_received_file_document_using_docling(
+    session: AsyncSession,
+    file_name: str,
+    docling_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+) -> Optional[Document]:
+    """
+    Process and store document content parsed by Docling.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        docling_markdown_document: Markdown content from Docling parsing
+        search_space_id: ID of the search space
+        user_id: ID of the user
+
+    Returns:
+        Document object if successful, None if failed
+    """
+    try:
+        file_in_markdown = docling_markdown_document
+
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
+
+        # Check if document with this content hash already exists
+        existing_doc_result = await session.execute(
+            select(Document).where(Document.content_hash == content_hash)
+        )
+        existing_document = existing_doc_result.scalars().first()
+        
+        if existing_document:
+            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
+            return existing_document
+
+        # Get user's long context LLM
+        user_llm = await get_user_long_context_llm(session, user_id)
+        if not user_llm:
+            raise RuntimeError(f"No long context LLM configured for user {user_id}")
+
+        # Generate summary using chunked processing for large documents
+        from app.services.document_processing.docling_service import create_docling_service
+        docling_service = create_docling_service()
+        
+        summary_content = await docling_service.process_large_document_summary(
+            content=file_in_markdown,
+            llm=user_llm,
+            document_title=file_name
+        )
+        summary_embedding = config.embedding_model_instance.embed(summary_content)
+
+        # Process chunks
+        chunks = [
+            Chunk(
+                content=chunk.text,
+                embedding=config.embedding_model_instance.embed(chunk.text),
+            )
+            for chunk in config.chunker_instance.chunk(file_in_markdown)
+        ]
+
+        # Create and store document
+        document = Document(
+            search_space_id=search_space_id,
+            title=file_name,
+            document_type=DocumentType.FILE,
+            document_metadata={
+                "FILE_NAME": file_name,
+                "ETL_SERVICE": "DOCLING",
+            },
+            content=summary_content,
+            embedding=summary_embedding,
+            chunks=chunks,
+            content_hash=content_hash,
+        )
+
+        session.add(document)
+        await session.commit()
+        await session.refresh(document)
+
+        return document
+    except SQLAlchemyError as db_error:
+        await session.rollback()
+        raise db_error
+    except Exception as e:
+        await session.rollback()
+        raise RuntimeError(f"Failed to process file document using Docling: {str(e)}")
+
+
 async def add_youtube_video_document(
    session: AsyncSession, url: str, search_space_id: int, user_id: str
 ):
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -9,6 +9,7 @@ dependencies = [
    "asyncpg>=0.30.0",
    "chonkie[all]>=1.0.6",
    "discord-py>=2.5.2",
+    "docling>=2.15.0",
    "fastapi>=0.115.8",
    "fastapi-users[oauth,sqlalchemy]>=14.0.1",
    "firecrawl-py>=1.12.0",
@ -17,7 +18,7 @@ dependencies = [
    "langchain-unstructured>=0.1.6",
    "langgraph>=0.3.29",
    "linkup-sdk>=0.2.4",
-    "litellm>=1.61.4",
+    "litellm>=1.61.4,<1.70.0",
    "llama-cloud-services>=0.6.25",
    "markdownify>=0.14.1",
    "notion-client>=2.3.0",
@ -29,6 +30,7 @@ dependencies = [
    "slack-sdk>=3.34.0",
    "static-ffmpeg>=2.13",
    "tavily-python>=0.3.2",
+    "tesserocr>=2.8.0",
    "unstructured-client>=0.30.0",
    "unstructured[all-docs]>=0.16.25",
    "uvicorn[standard]>=0.34.0",
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
--- a/surfsense_web/.env.example
+++ b/surfsense_web/.env.example
@ -1,3 +1,3 @@
 NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
 NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
-NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
				`@ -0,0 +1 @@`
				`# Document processing services for SurfSense`