mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-01 18:19:08 +00:00
feat: Add Docling support as ETL_SERVICE option
- Added DOCLING as third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD) - Implemented add_received_file_document_using_docling function - Added Docling processing logic in documents_routes.py - Enhanced chunking with configurable overlap support - Added comprehensive document processing service - Supports both CPU and GPU processing with user selection Addresses #161 - Add Docling Support as an ETL_SERVICE Follows same pattern as LlamaCloud integration (PR #123)
This commit is contained in:
parent
f852bcb188
commit
aa00822169
14 changed files with 3125 additions and 2090 deletions
17
.env
Normal file
17
.env
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Frontend Configuration
|
||||
FRONTEND_PORT=3000
|
||||
NEXT_PUBLIC_API_URL=http://backend:8000
|
||||
|
||||
# Backend Configuration
|
||||
BACKEND_PORT=8000
|
||||
|
||||
# Database Configuration
|
||||
POSTGRES_USER=postgres
|
||||
POSTGRES_PASSWORD=postgres
|
||||
POSTGRES_DB=surfsense
|
||||
POSTGRES_PORT=5432
|
||||
|
||||
# pgAdmin Configuration
|
||||
PGADMIN_PORT=5050
|
||||
PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
|
||||
PGADMIN_DEFAULT_PASSWORD=surfsense
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,2 +1,4 @@
|
|||
.flashrank_cache*
|
||||
podcasts/
|
||||
reports/
|
||||
SURFSENSE_CRITICAL_FIXES_REPORT.md
|
||||
|
|
|
@ -2,7 +2,7 @@ version: '3.8'
|
|||
|
||||
services:
|
||||
frontend:
|
||||
image: ghcr.io/modsetter/surfsense_ui:latest
|
||||
build: ./surfsense_web
|
||||
ports:
|
||||
- "${FRONTEND_PORT:-3000}:3000"
|
||||
volumes:
|
||||
|
@ -14,7 +14,7 @@ services:
|
|||
- NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}
|
||||
|
||||
backend:
|
||||
image: ghcr.io/modsetter/surfsense_backend:latest
|
||||
build: ./surfsense_backend
|
||||
ports:
|
||||
- "${BACKEND_PORT:-8000}:8000"
|
||||
volumes:
|
||||
|
@ -28,3 +28,15 @@ services:
|
|||
- PYTHONPATH=/app
|
||||
- UVICORN_LOOP=asyncio
|
||||
- UNSTRUCTURED_HAS_PATCHED_LOOP=1
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
- LANGCHAIN_TRACING_V2=false
|
||||
- LANGSMITH_TRACING=false
|
||||
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
|
|
|
@ -34,7 +34,7 @@ STT_SERVICE_API_BASE=
|
|||
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
|
||||
|
||||
# File Parser Service
|
||||
ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
|
||||
ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
|
||||
UNSTRUCTURED_API_KEY=Tpu3P0U8iy
|
||||
LLAMA_CLOUD_API_KEY=llx-nnn
|
||||
|
||||
|
|
|
@ -2,20 +2,53 @@ FROM python:3.12-slim
|
|||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies
|
||||
# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc \
|
||||
python3-dev \
|
||||
ca-certificates \
|
||||
curl \
|
||||
wget \
|
||||
unzip \
|
||||
gnupg2 \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
libtesseract-dev \
|
||||
libleptonica-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Update certificates and install SSL tools
|
||||
RUN update-ca-certificates
|
||||
RUN pip install --upgrade certifi pip-system-certs
|
||||
|
||||
# Copy requirements
|
||||
COPY pyproject.toml .
|
||||
COPY uv.lock .
|
||||
|
||||
# Install CUDA-enabled PyTorch for WSL2 before other dependencies
|
||||
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
||||
|
||||
# Install python dependencies
|
||||
RUN pip install --no-cache-dir uv && \
|
||||
uv pip install --system --no-cache-dir -e .
|
||||
|
||||
# Set SSL environment variables dynamically
|
||||
RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
|
||||
echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
|
||||
echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
|
||||
echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
|
||||
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
|
||||
ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
|
||||
|
||||
# Pre-download EasyOCR models to avoid runtime SSL issues
|
||||
RUN mkdir -p /root/.EasyOCR/model
|
||||
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
|
||||
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
|
||||
RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
|
||||
|
||||
# Pre-download Docling models
|
||||
RUN python -c "try:\n from docling.document_converter import DocumentConverter\n conv = DocumentConverter()\nexcept:\n pass" || true
|
||||
|
||||
# Install Playwright browsers for web scraping if needed
|
||||
RUN pip install playwright && \
|
||||
playwright install --with-deps chromium
|
||||
|
@ -27,6 +60,9 @@ COPY . .
|
|||
ENV PYTHONPATH=/app
|
||||
ENV UVICORN_LOOP=asyncio
|
||||
|
||||
# Set Tesseract data path
|
||||
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
|
||||
|
||||
# Run
|
||||
EXPOSE 8000
|
||||
CMD ["python", "main.py"]
|
|
@ -157,6 +157,29 @@ def find_optimal_documents_with_binary_search(
|
|||
|
||||
def get_model_context_window(model_name: str) -> int:
|
||||
"""Get the total context window size for a model (input + output tokens)."""
|
||||
|
||||
# Known context windows for common models
|
||||
model_contexts = {
|
||||
'llama3.1:8b': 128000, # Llama 3.1 8B has 128K context
|
||||
'llama3.1:70b': 128000, # Llama 3.1 70B has 128K context
|
||||
'llama3.1:405b': 128000, # Llama 3.1 405B has 128K context
|
||||
'llama3:8b': 8192, # Llama 3 8B has 8K context
|
||||
'llama3:70b': 8192, # Llama 3 70B has 8K context
|
||||
'ollama/llama3.1:8b': 128000,
|
||||
'ollama/llama3.1:70b': 128000,
|
||||
'ollama/llama3:8b': 8192,
|
||||
'ollama/llama3:70b': 8192,
|
||||
}
|
||||
|
||||
# Check for exact match first
|
||||
if model_name in model_contexts:
|
||||
return model_contexts[model_name]
|
||||
|
||||
# Check for partial matches (e.g., model_name contains 'llama3.1')
|
||||
for model_key, context_size in model_contexts.items():
|
||||
if model_key in model_name.lower():
|
||||
return context_size
|
||||
|
||||
try:
|
||||
model_info = get_model_info(model_name)
|
||||
context_window = model_info.get(
|
||||
|
|
|
@ -84,6 +84,9 @@ class Config:
|
|||
# LlamaCloud API Key
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
|
||||
elif ETL_SERVICE == "DOCLING":
|
||||
# Docling doesn't require API keys - uses local processing
|
||||
pass
|
||||
|
||||
# Firecrawl API Key
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
|
||||
|
|
|
@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
|
|||
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
|
||||
from app.config import config as app_config
|
||||
# Force asyncio to use standard event loop before unstructured imports
|
||||
import asyncio
|
||||
|
@ -269,6 +269,31 @@ async def process_file_in_background(
|
|||
search_space_id=search_space_id,
|
||||
user_id=user_id
|
||||
)
|
||||
elif app_config.ETL_SERVICE == "DOCLING":
|
||||
# Use Docling service for document processing
|
||||
from app.services.document_processing.docling_service import create_docling_service
|
||||
|
||||
# Create Docling service
|
||||
docling_service = create_docling_service()
|
||||
|
||||
# Process the document
|
||||
result = await docling_service.process_document(file_path, filename)
|
||||
|
||||
# Clean up the temp file
|
||||
import os
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Process the document using our Docling background task
|
||||
await add_received_file_document_using_docling(
|
||||
session,
|
||||
filename,
|
||||
docling_markdown_document=result['content'],
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id
|
||||
)
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"Error processing file in background: {str(e)}")
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# Document processing services for SurfSense
|
|
@ -0,0 +1,346 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Docling Document Processing Service for SurfSense
|
||||
SSL-safe implementation with pre-downloaded models
|
||||
"""
|
||||
|
||||
import logging
|
||||
import ssl
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DoclingService:
|
||||
"""Docling service for enhanced document processing with SSL fixes."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize Docling service with SSL, model fixes, and GPU acceleration."""
|
||||
self.converter = None
|
||||
self.use_gpu = False
|
||||
self._configure_ssl_environment()
|
||||
self._check_wsl2_gpu_support()
|
||||
self._initialize_docling()
|
||||
|
||||
def _configure_ssl_environment(self):
|
||||
"""Configure SSL environment for secure model downloads."""
|
||||
try:
|
||||
# Set SSL context for downloads
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
||||
# Set SSL environment variables if not already set
|
||||
if not os.environ.get('SSL_CERT_FILE'):
|
||||
try:
|
||||
import certifi
|
||||
os.environ['SSL_CERT_FILE'] = certifi.where()
|
||||
os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
logger.info("🔐 SSL environment configured for model downloads")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ SSL configuration warning: {e}")
|
||||
|
||||
def _check_wsl2_gpu_support(self):
|
||||
"""Check and configure GPU support for WSL2 environment."""
|
||||
try:
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
gpu_count = torch.cuda.device_count()
|
||||
gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
|
||||
logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
|
||||
logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
|
||||
self.use_gpu = True
|
||||
else:
|
||||
logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
|
||||
self.use_gpu = False
|
||||
except ImportError:
|
||||
logger.info("⚠️ PyTorch not found, falling back to CPU")
|
||||
self.use_gpu = False
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
|
||||
self.use_gpu = False
|
||||
|
||||
def _initialize_docling(self):
|
||||
"""Initialize Docling with version-safe configuration."""
|
||||
try:
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
|
||||
logger.info("🔧 Initializing Docling with version-safe configuration...")
|
||||
|
||||
# Create pipeline options with version-safe attribute checking
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
|
||||
# Disable OCR (user request)
|
||||
if hasattr(pipeline_options, 'do_ocr'):
|
||||
pipeline_options.do_ocr = False
|
||||
logger.info("⚠️ OCR disabled by user request")
|
||||
else:
|
||||
logger.warning("⚠️ OCR attribute not available in this Docling version")
|
||||
|
||||
# Enable table structure if available
|
||||
if hasattr(pipeline_options, 'do_table_structure'):
|
||||
pipeline_options.do_table_structure = True
|
||||
logger.info("✅ Table structure detection enabled")
|
||||
|
||||
# Configure GPU acceleration for WSL2 if available
|
||||
if hasattr(pipeline_options, 'accelerator_device'):
|
||||
if self.use_gpu:
|
||||
try:
|
||||
pipeline_options.accelerator_device = "cuda"
|
||||
logger.info("🚀 GPU acceleration enabled (CUDA)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
|
||||
pipeline_options.accelerator_device = "cpu"
|
||||
else:
|
||||
pipeline_options.accelerator_device = "cpu"
|
||||
logger.info("🖥️ Using CPU acceleration")
|
||||
else:
|
||||
logger.info("ℹ️ Accelerator device attribute not available in this Docling version")
|
||||
|
||||
# Create PDF format option with backend
|
||||
pdf_format_option = PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=PyPdfiumDocumentBackend
|
||||
)
|
||||
|
||||
# Initialize DocumentConverter
|
||||
self.converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: pdf_format_option
|
||||
}
|
||||
)
|
||||
|
||||
acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
|
||||
logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"❌ Docling not installed: {e}")
|
||||
raise RuntimeError(f"Docling not available: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Docling initialization failed: {e}")
|
||||
raise RuntimeError(f"Docling initialization failed: {e}")
|
||||
|
||||
def _configure_easyocr_local_models(self):
|
||||
"""Configure EasyOCR to use pre-downloaded local models."""
|
||||
try:
|
||||
import easyocr
|
||||
import os
|
||||
|
||||
# Set SSL environment for EasyOCR downloads
|
||||
os.environ['CURL_CA_BUNDLE'] = ''
|
||||
os.environ['REQUESTS_CA_BUNDLE'] = ''
|
||||
|
||||
# Try to use local models first, fallback to download if needed
|
||||
try:
|
||||
reader = easyocr.Reader(['en'],
|
||||
download_enabled=False,
|
||||
model_storage_directory="/root/.EasyOCR/model")
|
||||
logger.info("✅ EasyOCR configured for local models")
|
||||
return reader
|
||||
except:
|
||||
# If local models fail, allow download with SSL bypass
|
||||
logger.info("🔄 Local models failed, attempting download with SSL bypass...")
|
||||
reader = easyocr.Reader(['en'],
|
||||
download_enabled=True,
|
||||
model_storage_directory="/root/.EasyOCR/model")
|
||||
logger.info("✅ EasyOCR configured with downloaded models")
|
||||
return reader
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
|
||||
return None
|
||||
|
||||
async def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
|
||||
"""Process document with Docling using pre-downloaded models."""
|
||||
|
||||
if self.converter is None:
|
||||
raise RuntimeError("Docling converter not initialized")
|
||||
|
||||
try:
|
||||
logger.info(f"🔄 Processing {filename} with Docling (using local models)...")
|
||||
|
||||
# Process document with local models
|
||||
result = self.converter.convert(file_path)
|
||||
|
||||
# Extract content using version-safe methods
|
||||
content = None
|
||||
if hasattr(result, 'document') and result.document:
|
||||
# Try different export methods (version compatibility)
|
||||
if hasattr(result.document, 'export_to_markdown'):
|
||||
content = result.document.export_to_markdown()
|
||||
logger.info("📄 Used export_to_markdown method")
|
||||
elif hasattr(result.document, 'to_markdown'):
|
||||
content = result.document.to_markdown()
|
||||
logger.info("📄 Used to_markdown method")
|
||||
elif hasattr(result.document, 'text'):
|
||||
content = result.document.text
|
||||
logger.info("📄 Used text property")
|
||||
elif hasattr(result.document, '__str__'):
|
||||
content = str(result.document)
|
||||
logger.info("📄 Used string conversion")
|
||||
|
||||
if content:
|
||||
logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")
|
||||
|
||||
return {
|
||||
'content': content,
|
||||
'full_text': content,
|
||||
'service_used': 'docling',
|
||||
'status': 'success',
|
||||
'processing_notes': 'Processed with Docling using pre-downloaded models'
|
||||
}
|
||||
else:
|
||||
raise ValueError("No content could be extracted from document")
|
||||
else:
|
||||
raise ValueError("No document object returned by Docling")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Docling processing failed for {filename}: {e}")
|
||||
# Log the full error for debugging
|
||||
import traceback
|
||||
logger.error(f"Full traceback: {traceback.format_exc()}")
|
||||
raise RuntimeError(f"Docling processing failed: {e}")
|
||||
|
||||
async def process_large_document_summary(
|
||||
self,
|
||||
content: str,
|
||||
llm,
|
||||
document_title: str = "Document"
|
||||
) -> str:
|
||||
"""
|
||||
Process large documents using chunked LLM summarization.
|
||||
|
||||
Args:
|
||||
content: The full document content
|
||||
llm: The language model to use for summarization
|
||||
document_title: Title of the document for context
|
||||
|
||||
Returns:
|
||||
Final summary of the document
|
||||
"""
|
||||
# Large document threshold (100K characters ≈ 25K tokens)
|
||||
LARGE_DOCUMENT_THRESHOLD = 100_000
|
||||
|
||||
if len(content) <= LARGE_DOCUMENT_THRESHOLD:
|
||||
# For smaller documents, use direct processing
|
||||
logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
|
||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
|
||||
result = await summary_chain.ainvoke({"document": content})
|
||||
return result.content
|
||||
|
||||
logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")
|
||||
|
||||
# Import chunker from config
|
||||
from app.config import config
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
|
||||
# Create LLM-optimized chunks (8K tokens max for safety)
|
||||
from chonkie import RecursiveChunker, OverlapRefinery
|
||||
llm_chunker = RecursiveChunker(
|
||||
chunk_size=8000 # Conservative for most LLMs
|
||||
)
|
||||
|
||||
# Apply overlap refinery for context preservation (10% overlap = 800 tokens)
|
||||
overlap_refinery = OverlapRefinery(
|
||||
context_size=0.1, # 10% overlap for context preservation
|
||||
method="suffix" # Add next chunk context to current chunk
|
||||
)
|
||||
|
||||
# First chunk the content, then apply overlap refinery
|
||||
initial_chunks = llm_chunker.chunk(content)
|
||||
chunks = overlap_refinery.refine(initial_chunks)
|
||||
total_chunks = len(chunks)
|
||||
|
||||
logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
|
||||
|
||||
# Template for chunk processing
|
||||
chunk_template = PromptTemplate(
|
||||
input_variables=["chunk", "chunk_number", "total_chunks"],
|
||||
template="""<INSTRUCTIONS>
|
||||
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
|
||||
|
||||
Create a comprehensive summary of this document chunk. Focus on:
|
||||
- Key concepts, facts, and information
|
||||
- Important details and context
|
||||
- Main topics and themes
|
||||
|
||||
Provide a clear, structured summary that captures the essential content.
|
||||
|
||||
Chunk {chunk_number}/{total_chunks}:
|
||||
<document_chunk>
|
||||
{chunk}
|
||||
</document_chunk>
|
||||
</INSTRUCTIONS>"""
|
||||
)
|
||||
|
||||
# Process each chunk individually
|
||||
chunk_summaries = []
|
||||
for i, chunk in enumerate(chunks, 1):
|
||||
try:
|
||||
logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")
|
||||
|
||||
chunk_chain = chunk_template | llm
|
||||
chunk_result = await chunk_chain.ainvoke({
|
||||
"chunk": chunk.text,
|
||||
"chunk_number": i,
|
||||
"total_chunks": total_chunks
|
||||
})
|
||||
|
||||
chunk_summary = chunk_result.content
|
||||
chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
|
||||
|
||||
logger.info(f"✅ Completed chunk {i}/{total_chunks}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
|
||||
chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")
|
||||
|
||||
# Combine summaries into final document summary
|
||||
logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
|
||||
|
||||
try:
|
||||
combine_template = PromptTemplate(
|
||||
input_variables=["summaries", "document_title"],
|
||||
template="""<INSTRUCTIONS>
|
||||
You are combining multiple section summaries into a final comprehensive document summary.
|
||||
|
||||
Create a unified, coherent summary from the following section summaries of "{document_title}".
|
||||
Ensure:
|
||||
- Logical flow and organization
|
||||
- No redundancy or repetition
|
||||
- Comprehensive coverage of all key points
|
||||
- Professional, objective tone
|
||||
|
||||
<section_summaries>
|
||||
{summaries}
|
||||
</section_summaries>
|
||||
</INSTRUCTIONS>"""
|
||||
)
|
||||
|
||||
combined_summaries = "\n\n".join(chunk_summaries)
|
||||
combine_chain = combine_template | llm
|
||||
|
||||
final_result = await combine_chain.ainvoke({
|
||||
"summaries": combined_summaries,
|
||||
"document_title": document_title
|
||||
})
|
||||
|
||||
final_summary = final_result.content
|
||||
logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")
|
||||
|
||||
return final_summary
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to combine summaries: {e}")
|
||||
# Fallback: return concatenated chunk summaries
|
||||
fallback_summary = "\n\n".join(chunk_summaries)
|
||||
logger.warning("⚠️ Using fallback combined summary")
|
||||
return fallback_summary
|
||||
|
||||
def create_docling_service() -> DoclingService:
|
||||
"""Create a Docling service instance."""
|
||||
return DoclingService()
|
|
@ -459,6 +459,94 @@ async def add_received_file_document_using_llamacloud(
|
|||
raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")
|
||||
|
||||
|
||||
async def add_received_file_document_using_docling(
|
||||
session: AsyncSession,
|
||||
file_name: str,
|
||||
docling_markdown_document: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
) -> Optional[Document]:
|
||||
"""
|
||||
Process and store document content parsed by Docling.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
file_name: Name of the processed file
|
||||
docling_markdown_document: Markdown content from Docling parsing
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
|
||||
Returns:
|
||||
Document object if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
file_in_markdown = docling_markdown_document
|
||||
|
||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||
|
||||
# Check if document with this content hash already exists
|
||||
existing_doc_result = await session.execute(
|
||||
select(Document).where(Document.content_hash == content_hash)
|
||||
)
|
||||
existing_document = existing_doc_result.scalars().first()
|
||||
|
||||
if existing_document:
|
||||
logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
|
||||
return existing_document
|
||||
|
||||
# Get user's long context LLM
|
||||
user_llm = await get_user_long_context_llm(session, user_id)
|
||||
if not user_llm:
|
||||
raise RuntimeError(f"No long context LLM configured for user {user_id}")
|
||||
|
||||
# Generate summary using chunked processing for large documents
|
||||
from app.services.document_processing.docling_service import create_docling_service
|
||||
docling_service = create_docling_service()
|
||||
|
||||
summary_content = await docling_service.process_large_document_summary(
|
||||
content=file_in_markdown,
|
||||
llm=user_llm,
|
||||
document_title=file_name
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = [
|
||||
Chunk(
|
||||
content=chunk.text,
|
||||
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||
)
|
||||
for chunk in config.chunker_instance.chunk(file_in_markdown)
|
||||
]
|
||||
|
||||
# Create and store document
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
document_type=DocumentType.FILE,
|
||||
document_metadata={
|
||||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "DOCLING",
|
||||
},
|
||||
content=summary_content,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
|
||||
return document
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
raise db_error
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
raise RuntimeError(f"Failed to process file document using Docling: {str(e)}")
|
||||
|
||||
|
||||
async def add_youtube_video_document(
|
||||
session: AsyncSession, url: str, search_space_id: int, user_id: str
|
||||
):
|
||||
|
|
|
@ -9,6 +9,7 @@ dependencies = [
|
|||
"asyncpg>=0.30.0",
|
||||
"chonkie[all]>=1.0.6",
|
||||
"discord-py>=2.5.2",
|
||||
"docling>=2.15.0",
|
||||
"fastapi>=0.115.8",
|
||||
"fastapi-users[oauth,sqlalchemy]>=14.0.1",
|
||||
"firecrawl-py>=1.12.0",
|
||||
|
@ -17,7 +18,7 @@ dependencies = [
|
|||
"langchain-unstructured>=0.1.6",
|
||||
"langgraph>=0.3.29",
|
||||
"linkup-sdk>=0.2.4",
|
||||
"litellm>=1.61.4",
|
||||
"litellm>=1.61.4,<1.70.0",
|
||||
"llama-cloud-services>=0.6.25",
|
||||
"markdownify>=0.14.1",
|
||||
"notion-client>=2.3.0",
|
||||
|
@ -29,6 +30,7 @@ dependencies = [
|
|||
"slack-sdk>=3.34.0",
|
||||
"static-ffmpeg>=2.13",
|
||||
"tavily-python>=0.3.2",
|
||||
"tesserocr>=2.8.0",
|
||||
"unstructured-client>=0.30.0",
|
||||
"unstructured[all-docs]>=0.16.25",
|
||||
"uvicorn[standard]>=0.34.0",
|
||||
|
|
4646
surfsense_backend/uv.lock
generated
4646
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -1,3 +1,3 @@
|
|||
NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
|
||||
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
|
||||
NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
|
||||
NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
|
Loading…
Add table
Reference in a new issue