SurfSense/surfsense_backend/app/services/document_processing/docling_service.py
Abdullah 3li aa00822169 feat: Add Docling support as ETL_SERVICE option
- Added DOCLING as third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing with user selection

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows same pattern as LlamaCloud integration (PR #123)
2025-07-20 11:42:55 +03:00

#!/usr/bin/env python3
"""
Docling Document Processing Service for SurfSense
SSL-safe implementation with pre-downloaded models
"""
import logging
import os
import ssl
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)


class DoclingService:
    """Docling service for enhanced document processing with SSL fixes."""

    def __init__(self):
        """Initialize Docling service with SSL, model fixes, and GPU acceleration."""
        self.converter = None
        self.use_gpu = False
        self._configure_ssl_environment()
        self._check_wsl2_gpu_support()
        self._initialize_docling()

    def _configure_ssl_environment(self):
        """Configure the SSL environment for model downloads.

        Note: this disables default certificate verification as a workaround
        for environments with intercepting proxies; prefer a proper CA bundle
        where possible.
        """
        try:
            # WARNING: replaces the default HTTPS context with an unverified
            # one, so model downloads will not validate TLS certificates
            ssl._create_default_https_context = ssl._create_unverified_context
            # Point SSL at certifi's CA bundle if no bundle is configured yet
            if not os.environ.get("SSL_CERT_FILE"):
                try:
                    import certifi

                    os.environ["SSL_CERT_FILE"] = certifi.where()
                    os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
                except ImportError:
                    pass
            logger.info("🔐 SSL environment configured for model downloads")
        except Exception as e:
            logger.warning(f"⚠️ SSL configuration warning: {e}")

    def _check_wsl2_gpu_support(self):
        """Check and configure GPU support for a WSL2 environment."""
        try:
            import torch

            if torch.cuda.is_available():
                gpu_count = torch.cuda.device_count()
                gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
                logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
                logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
                self.use_gpu = True
            else:
                logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
                self.use_gpu = False
        except ImportError:
            logger.info("⚠️ PyTorch not found, falling back to CPU")
            self.use_gpu = False
        except Exception as e:
            logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
            self.use_gpu = False

    def _initialize_docling(self):
        """Initialize Docling with version-safe configuration."""
        try:
            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
            from docling.datamodel.base_models import InputFormat
            from docling.datamodel.pipeline_options import PdfPipelineOptions
            from docling.document_converter import DocumentConverter, PdfFormatOption

            logger.info("🔧 Initializing Docling with version-safe configuration...")

            # Create pipeline options with version-safe attribute checking
            pipeline_options = PdfPipelineOptions()

            # Disable OCR (user request)
            if hasattr(pipeline_options, "do_ocr"):
                pipeline_options.do_ocr = False
                logger.info("⚠️ OCR disabled by user request")
            else:
                logger.warning("⚠️ OCR attribute not available in this Docling version")

            # Enable table structure detection if available
            if hasattr(pipeline_options, "do_table_structure"):
                pipeline_options.do_table_structure = True
                logger.info("✅ Table structure detection enabled")

            # Configure GPU acceleration for WSL2 if available
            if hasattr(pipeline_options, "accelerator_device"):
                if self.use_gpu:
                    try:
                        pipeline_options.accelerator_device = "cuda"
                        logger.info("🚀 GPU acceleration enabled (CUDA)")
                    except Exception as e:
                        logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
                        pipeline_options.accelerator_device = "cpu"
                else:
                    pipeline_options.accelerator_device = "cpu"
                    logger.info("🖥️ Using CPU acceleration")
            else:
                logger.info("⚠️ Accelerator device attribute not available in this Docling version")

            # Create the PDF format option with the pypdfium2 backend
            pdf_format_option = PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            )

            # Initialize the DocumentConverter
            self.converter = DocumentConverter(
                format_options={InputFormat.PDF: pdf_format_option}
            )

            acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
            logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")
        except ImportError as e:
            logger.error(f"❌ Docling not installed: {e}")
            raise RuntimeError(f"Docling not available: {e}") from e
        except Exception as e:
            logger.error(f"❌ Docling initialization failed: {e}")
            raise RuntimeError(f"Docling initialization failed: {e}") from e

    def _configure_easyocr_local_models(self):
        """Configure EasyOCR to use pre-downloaded local models.

        Note: not invoked by the current pipeline (OCR is disabled in
        _initialize_docling); kept for OCR-enabled configurations.
        """
        try:
            import easyocr

            # Empty the CA-bundle variables so EasyOCR downloads bypass SSL
            # verification (workaround for certificate errors behind proxies)
            os.environ["CURL_CA_BUNDLE"] = ""
            os.environ["REQUESTS_CA_BUNDLE"] = ""

            # Try local models first, fall back to downloading if needed
            try:
                reader = easyocr.Reader(
                    ["en"],
                    download_enabled=False,
                    model_storage_directory="/root/.EasyOCR/model",
                )
                logger.info("✅ EasyOCR configured for local models")
                return reader
            except Exception:
                # If local models fail, allow download with SSL bypass
                logger.info("🔄 Local models failed, attempting download with SSL bypass...")
                reader = easyocr.Reader(
                    ["en"],
                    download_enabled=True,
                    model_storage_directory="/root/.EasyOCR/model",
                )
                logger.info("✅ EasyOCR configured with downloaded models")
                return reader
        except Exception as e:
            logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
            return None

    async def process_document(self, file_path: str, filename: Optional[str] = None) -> Dict[str, Any]:
        """Process a document with Docling using pre-downloaded models."""
        if self.converter is None:
            raise RuntimeError("Docling converter not initialized")
        try:
            logger.info(f"🔄 Processing {filename} with Docling (using local models)...")

            # Convert the document with local models (synchronous, CPU/GPU bound)
            result = self.converter.convert(file_path)

            # Extract content using version-safe methods
            content = None
            if hasattr(result, "document") and result.document:
                # Try the export methods available across Docling versions
                if hasattr(result.document, "export_to_markdown"):
                    content = result.document.export_to_markdown()
                    logger.info("📄 Used export_to_markdown method")
                elif hasattr(result.document, "to_markdown"):
                    content = result.document.to_markdown()
                    logger.info("📄 Used to_markdown method")
                elif hasattr(result.document, "text"):
                    content = result.document.text
                    logger.info("📄 Used text property")
                else:
                    # Every object has __str__, so this is the final fallback
                    content = str(result.document)
                    logger.info("📄 Used string conversion")

                if content:
                    logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")
                    return {
                        "content": content,
                        "full_text": content,
                        "service_used": "docling",
                        "status": "success",
                        "processing_notes": "Processed with Docling using pre-downloaded models",
                    }
                raise ValueError("No content could be extracted from document")
            raise ValueError("No document object returned by Docling")
        except Exception as e:
            logger.error(f"❌ Docling processing failed for {filename}: {e}")
            # Log the full traceback for debugging
            import traceback

            logger.error(f"Full traceback: {traceback.format_exc()}")
            raise RuntimeError(f"Docling processing failed: {e}") from e

    async def process_large_document_summary(
        self,
        content: str,
        llm,
        document_title: str = "Document",
    ) -> str:
        """Process large documents using chunked LLM summarization.

        Args:
            content: The full document content.
            llm: The language model to use for summarization.
            document_title: Title of the document, for context.

        Returns:
            Final summary of the document.
        """
        # Large-document threshold (100K characters ≈ 25K tokens)
        LARGE_DOCUMENT_THRESHOLD = 100_000

        if len(content) <= LARGE_DOCUMENT_THRESHOLD:
            # For smaller documents, use direct single-pass summarization
            logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
            from app.prompts import SUMMARY_PROMPT_TEMPLATE

            summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
            result = await summary_chain.ainvoke({"document": content})
            return result.content

        logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")

        from langchain_core.prompts import PromptTemplate

        # Create LLM-optimized chunks with chonkie (8K tokens max for safety)
        from chonkie import OverlapRefinery, RecursiveChunker

        llm_chunker = RecursiveChunker(
            chunk_size=8000  # Conservative for most LLMs
        )
        # Apply an overlap refinery for context preservation (10% overlap = 800 tokens)
        overlap_refinery = OverlapRefinery(
            context_size=0.1,  # 10% overlap for context preservation
            method="suffix",  # Append the next chunk's context to the current chunk
        )
        # First chunk the content, then apply the overlap refinery
        initial_chunks = llm_chunker.chunk(content)
        chunks = overlap_refinery.refine(initial_chunks)
        total_chunks = len(chunks)
        logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
        # Template for per-chunk summarization
        chunk_template = PromptTemplate(
            input_variables=["chunk", "chunk_number", "total_chunks"],
            template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.

Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes

Provide a clear, structured summary that captures the essential content.

Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>""",
        )

        # Process each chunk individually
        chunk_summaries = []
        for i, chunk in enumerate(chunks, 1):
            try:
                logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")
                chunk_chain = chunk_template | llm
                chunk_result = await chunk_chain.ainvoke({
                    "chunk": chunk.text,
                    "chunk_number": i,
                    "total_chunks": total_chunks,
                })
                chunk_summary = chunk_result.content
                chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
                logger.info(f"✅ Completed chunk {i}/{total_chunks}")
            except Exception as e:
                logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
                chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")

        # Combine the section summaries into a final document summary
        logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
        try:
            combine_template = PromptTemplate(
                input_variables=["summaries", "document_title"],
                template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.

Create a unified, coherent summary from the following section summaries of "{document_title}".

Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone

<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>""",
            )
            combined_summaries = "\n\n".join(chunk_summaries)
            combine_chain = combine_template | llm
            final_result = await combine_chain.ainvoke({
                "summaries": combined_summaries,
                "document_title": document_title,
            })
            final_summary = final_result.content
            logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")
            return final_summary
        except Exception as e:
            logger.error(f"❌ Failed to combine summaries: {e}")
            # Fallback: return the concatenated chunk summaries
            fallback_summary = "\n\n".join(chunk_summaries)
            logger.warning("⚠️ Using fallback combined summary")
            return fallback_summary


def create_docling_service() -> DoclingService:
    """Create a Docling service instance."""
    return DoclingService()
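

# ---------------------------------------------------------------------------
# Minimal usage sketch (a local-testing aid, not part of the SurfSense API):
# shows how the factory and process_document fit together. The sample PDF
# path below is a hypothetical placeholder; in SurfSense the service is
# invoked from documents_routes.py.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = create_docling_service()
        # Hypothetical sample file; point this at a real PDF to test locally
        result = await service.process_document("/tmp/sample.pdf", filename="sample.pdf")
        print(result["status"], "-", len(result["full_text"]), "chars extracted")

    asyncio.run(_demo())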