feat: Add Docling support as ETL_SERVICE option

- Added DOCLING as a third ETL_SERVICE option (alongside UNSTRUCTURED and LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing (GPU auto-detected in WSL2, with CPU fallback)

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows the same pattern as the LlamaCloud integration (PR #123)
Abdullah 3li 2025-07-20 11:42:55 +03:00
parent f852bcb188
commit aa00822169
14 changed files with 3125 additions and 2090 deletions

.env Normal file

@@ -0,0 +1,17 @@
# Frontend Configuration
FRONTEND_PORT=3000
NEXT_PUBLIC_API_URL=http://backend:8000
# Backend Configuration
BACKEND_PORT=8000
# Database Configuration
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=surfsense
POSTGRES_PORT=5432
# pgAdmin Configuration
PGADMIN_PORT=5050
PGADMIN_DEFAULT_EMAIL=admin@surfsense.com
PGADMIN_DEFAULT_PASSWORD=surfsense

.gitignore vendored

@@ -1,2 +1,4 @@
.flashrank_cache*
podcasts/
reports/
SURFSENSE_CRITICAL_FIXES_REPORT.md

docker-compose.yml

@@ -2,7 +2,7 @@ version: '3.8'
services:
  frontend:
-    image: ghcr.io/modsetter/surfsense_ui:latest
+    build: ./surfsense_web
    ports:
      - "${FRONTEND_PORT:-3000}:3000"
    volumes:
@@ -14,7 +14,7 @@ services:
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}
  backend:
-    image: ghcr.io/modsetter/surfsense_backend:latest
+    build: ./surfsense_backend
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
@@ -28,3 +28,15 @@ services:
      - PYTHONPATH=/app
      - UVICORN_LOOP=asyncio
      - UNSTRUCTURED_HAS_PATCHED_LOOP=1
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - LANGCHAIN_TRACING_V2=false
      - LANGSMITH_TRACING=false
      - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
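
A quick way to confirm the GPU reservation above actually reaches the container is a torch probe; a minimal sketch, assuming the CUDA-enabled PyTorch wheel from the backend Dockerfile below is installed:

# gpu_check.py - run inside the backend container to verify GPU passthrough
import torch

if torch.cuda.is_available():
    # One line per visible device, e.g. "0: NVIDIA GeForce RTX 3090"
    for i in range(torch.cuda.device_count()):
        print(f"{i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA not available - Docling will fall back to CPU")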

surfsense_backend/.env.example

@@ -34,7 +34,7 @@ STT_SERVICE_API_BASE=
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
# File Parser Service
-ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING
UNSTRUCTURED_API_KEY=Tpu3P0U8iy
LLAMA_CLOUD_API_KEY=llx-nnn

surfsense_backend/Dockerfile

@@ -2,20 +2,53 @@ FROM python:3.12-slim
WORKDIR /app

-# Install system dependencies
+# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    python3-dev \
    ca-certificates \
    curl \
    wget \
    unzip \
    gnupg2 \
    tesseract-ocr \
    tesseract-ocr-eng \
    libtesseract-dev \
    libleptonica-dev \
    && rm -rf /var/lib/apt/lists/*

# Update certificates and install SSL tools
RUN update-ca-certificates
RUN pip install --upgrade certifi pip-system-certs

# Copy requirements
COPY pyproject.toml .
COPY uv.lock .

# Install CUDA-enabled PyTorch for WSL2 before other dependencies
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install python dependencies
RUN pip install --no-cache-dir uv && \
    uv pip install --system --no-cache-dir -e .

# Set SSL environment variables dynamically
RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem

# Pre-download EasyOCR models to avoid runtime SSL issues
RUN mkdir -p /root/.EasyOCR/model
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)

# Pre-download Docling models (failures tolerated via || true)
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true

# Install Playwright browsers for web scraping if needed
RUN pip install playwright && \
    playwright install --with-deps chromium
@@ -27,6 +60,9 @@ COPY . .
ENV PYTHONPATH=/app
ENV UVICORN_LOOP=asyncio

# Set Tesseract data path
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/

# Run
EXPOSE 8000
CMD ["python", "main.py"]


@@ -157,6 +157,29 @@ def find_optimal_documents_with_binary_search(
def get_model_context_window(model_name: str) -> int:
    """Get the total context window size for a model (input + output tokens)."""
    # Known context windows for common models
    model_contexts = {
        'llama3.1:8b': 128000,    # Llama 3.1 8B has 128K context
        'llama3.1:70b': 128000,   # Llama 3.1 70B has 128K context
        'llama3.1:405b': 128000,  # Llama 3.1 405B has 128K context
        'llama3:8b': 8192,        # Llama 3 8B has 8K context
        'llama3:70b': 8192,       # Llama 3 70B has 8K context
        'ollama/llama3.1:8b': 128000,
        'ollama/llama3.1:70b': 128000,
        'ollama/llama3:8b': 8192,
        'ollama/llama3:70b': 8192,
    }

    # Check for exact match first
    if model_name in model_contexts:
        return model_contexts[model_name]

    # Check for partial matches (e.g., model_name contains 'llama3.1')
    for model_key, context_size in model_contexts.items():
        if model_key in model_name.lower():
            return context_size

    try:
        model_info = get_model_info(model_name)
        context_window = model_info.get(
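
The lookup resolves in three stages: exact dictionary hit, then case-insensitive substring match, then litellm's get_model_info. Illustrative calls against the table above:

# Expected resolution for a few model names
get_model_context_window('ollama/llama3.1:8b')  # exact match -> 128000
get_model_context_window('LLAMA3:8B-instruct')  # substring 'llama3:8b' -> 8192
get_model_context_window('gpt-4o')              # no table hit -> falls through to get_model_info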

surfsense_backend/app/config.py

@@ -84,6 +84,9 @@ class Config:
        # LlamaCloud API Key
        LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
    elif ETL_SERVICE == "DOCLING":
        # Docling doesn't require API keys - it uses local processing
        pass

    # Firecrawl API Key
    FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
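
Because the DOCLING branch deliberately skips key validation, a misspelled ETL_SERVICE value would fall through silently. A hypothetical startup guard (not part of this commit) could make that explicit:

VALID_ETL_SERVICES = {"UNSTRUCTURED", "LLAMACLOUD", "DOCLING"}  # hypothetical constant
if ETL_SERVICE not in VALID_ETL_SERVICES:
    raise ValueError(f"Unknown ETL_SERVICE '{ETL_SERVICE}'; expected one of {sorted(VALID_ETL_SERVICES)}")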

surfsense_backend/app/routes/documents_routes.py

@@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
-from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
+from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
from app.config import config as app_config

# Force asyncio to use standard event loop before unstructured imports
import asyncio
@@ -269,6 +269,31 @@ async def process_file_in_background(
                search_space_id=search_space_id,
                user_id=user_id
            )
        elif app_config.ETL_SERVICE == "DOCLING":
            # Use the Docling service for document processing
            from app.services.document_processing.docling_service import create_docling_service

            # Create the Docling service and process the document
            docling_service = create_docling_service()
            result = await docling_service.process_document(file_path, filename)

            # Clean up the temp file
            import os
            try:
                os.unlink(file_path)
            except OSError:
                pass

            # Store the parsed markdown via the Docling background task
            await add_received_file_document_using_docling(
                session,
                filename,
                docling_markdown_document=result['content'],
                search_space_id=search_space_id,
                user_id=user_id
            )
    except Exception as e:
        import logging
        logging.error(f"Error processing file in background: {str(e)}")

surfsense_backend/app/services/document_processing/__init__.py (new file)

@@ -0,0 +1 @@
# Document processing services for SurfSense

surfsense_backend/app/services/document_processing/docling_service.py (new file)

@@ -0,0 +1,346 @@
#!/usr/bin/env python3
"""
Docling Document Processing Service for SurfSense
SSL-safe implementation with pre-downloaded models
"""
import logging
import ssl
import os
from typing import Dict, Any

logger = logging.getLogger(__name__)


class DoclingService:
    """Docling service for enhanced document processing with SSL fixes."""

    def __init__(self):
        """Initialize Docling service with SSL, model fixes, and GPU acceleration."""
        self.converter = None
        self.use_gpu = False
        self._configure_ssl_environment()
        self._check_wsl2_gpu_support()
        self._initialize_docling()
    def _configure_ssl_environment(self):
        """Configure the SSL environment for model downloads.

        Note: this relaxes certificate verification so downloads succeed
        behind broken or intercepted certificate chains.
        """
        try:
            # Fall back to an unverified HTTPS context for downloads
            ssl._create_default_https_context = ssl._create_unverified_context

            # Set SSL environment variables if not already set
            if not os.environ.get('SSL_CERT_FILE'):
                try:
                    import certifi
                    os.environ['SSL_CERT_FILE'] = certifi.where()
                    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
                except ImportError:
                    pass

            logger.info("🔐 SSL environment configured for model downloads")
        except Exception as e:
            logger.warning(f"⚠️ SSL configuration warning: {e}")
    def _check_wsl2_gpu_support(self):
        """Check and configure GPU support for a WSL2 environment."""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_count = torch.cuda.device_count()
                gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
                logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
                logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
                self.use_gpu = True
            else:
                logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
                self.use_gpu = False
        except ImportError:
            logger.info("⚠️ PyTorch not found, falling back to CPU")
            self.use_gpu = False
        except Exception as e:
            logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
            self.use_gpu = False
    def _initialize_docling(self):
        """Initialize Docling with version-safe configuration."""
        try:
            from docling.document_converter import DocumentConverter, PdfFormatOption
            from docling.datamodel.base_models import InputFormat
            from docling.datamodel.pipeline_options import PdfPipelineOptions
            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

            logger.info("🔧 Initializing Docling with version-safe configuration...")

            # Create pipeline options with version-safe attribute checking
            pipeline_options = PdfPipelineOptions()

            # Disable OCR (user request)
            if hasattr(pipeline_options, 'do_ocr'):
                pipeline_options.do_ocr = False
                logger.info("⚠️ OCR disabled by user request")
            else:
                logger.warning("⚠️ OCR attribute not available in this Docling version")

            # Enable table structure detection if available
            if hasattr(pipeline_options, 'do_table_structure'):
                pipeline_options.do_table_structure = True
                logger.info("✅ Table structure detection enabled")

            # Configure GPU acceleration for WSL2 if available
            if hasattr(pipeline_options, 'accelerator_device'):
                if self.use_gpu:
                    try:
                        pipeline_options.accelerator_device = "cuda"
                        logger.info("🚀 GPU acceleration enabled (CUDA)")
                    except Exception as e:
                        logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
                        pipeline_options.accelerator_device = "cpu"
                else:
                    pipeline_options.accelerator_device = "cpu"
                    logger.info("🖥️ Using CPU acceleration")
            else:
                logger.info("Accelerator device attribute not available in this Docling version")

            # Create the PDF format option with an explicit backend
            pdf_format_option = PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend
            )

            # Initialize the DocumentConverter
            self.converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: pdf_format_option
                }
            )

            acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
            logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")
        except ImportError as e:
            logger.error(f"❌ Docling not installed: {e}")
            raise RuntimeError(f"Docling not available: {e}")
        except Exception as e:
            logger.error(f"❌ Docling initialization failed: {e}")
            raise RuntimeError(f"Docling initialization failed: {e}")
    def _configure_easyocr_local_models(self):
        """Configure EasyOCR to use pre-downloaded local models."""
        try:
            import easyocr
            import os

            # Clear CA bundle overrides so EasyOCR downloads skip strict verification
            os.environ['CURL_CA_BUNDLE'] = ''
            os.environ['REQUESTS_CA_BUNDLE'] = ''

            # Try local models first, falling back to download if needed
            try:
                reader = easyocr.Reader(['en'],
                                        download_enabled=False,
                                        model_storage_directory="/root/.EasyOCR/model")
                logger.info("✅ EasyOCR configured for local models")
                return reader
            except Exception:
                # If local models fail, allow download with the SSL bypass
                logger.info("🔄 Local models failed, attempting download with SSL bypass...")
                reader = easyocr.Reader(['en'],
                                        download_enabled=True,
                                        model_storage_directory="/root/.EasyOCR/model")
                logger.info("✅ EasyOCR configured with downloaded models")
                return reader
        except Exception as e:
            logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
            return None
    async def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
        """Process a document with Docling using pre-downloaded models."""
        if self.converter is None:
            raise RuntimeError("Docling converter not initialized")

        try:
            logger.info(f"🔄 Processing {filename} with Docling (using local models)...")

            # Process the document with local models
            result = self.converter.convert(file_path)

            # Extract content using version-safe methods
            content = None
            if hasattr(result, 'document') and result.document:
                # Try different export methods (version compatibility)
                if hasattr(result.document, 'export_to_markdown'):
                    content = result.document.export_to_markdown()
                    logger.info("📄 Used export_to_markdown method")
                elif hasattr(result.document, 'to_markdown'):
                    content = result.document.to_markdown()
                    logger.info("📄 Used to_markdown method")
                elif hasattr(result.document, 'text'):
                    content = result.document.text
                    logger.info("📄 Used text property")
                elif hasattr(result.document, '__str__'):
                    content = str(result.document)
                    logger.info("📄 Used string conversion")

                if content:
                    logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")
                    return {
                        'content': content,
                        'full_text': content,
                        'service_used': 'docling',
                        'status': 'success',
                        'processing_notes': 'Processed with Docling using pre-downloaded models'
                    }
                else:
                    raise ValueError("No content could be extracted from document")
            else:
                raise ValueError("No document object returned by Docling")
        except Exception as e:
            logger.error(f"❌ Docling processing failed for {filename}: {e}")
            # Log the full error for debugging
            import traceback
            logger.error(f"Full traceback: {traceback.format_exc()}")
            raise RuntimeError(f"Docling processing failed: {e}")
    async def process_large_document_summary(
        self,
        content: str,
        llm,
        document_title: str = "Document"
    ) -> str:
        """
        Process large documents using chunked LLM summarization.

        Args:
            content: The full document content
            llm: The language model to use for summarization
            document_title: Title of the document for context

        Returns:
            Final summary of the document
        """
        # Large document threshold (100K characters ≈ 25K tokens)
        LARGE_DOCUMENT_THRESHOLD = 100_000

        if len(content) <= LARGE_DOCUMENT_THRESHOLD:
            # For smaller documents, use direct processing
            logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
            from app.prompts import SUMMARY_PROMPT_TEMPLATE
            summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
            result = await summary_chain.ainvoke({"document": content})
            return result.content

        logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")

        from langchain_core.prompts import PromptTemplate
        from chonkie import RecursiveChunker, OverlapRefinery

        # Create LLM-optimized chunks (8K tokens max for safety)
        llm_chunker = RecursiveChunker(
            chunk_size=8000  # Conservative for most LLMs
        )

        # Apply an overlap refinery for context preservation (10% overlap = 800 tokens)
        overlap_refinery = OverlapRefinery(
            context_size=0.1,  # 10% overlap for context preservation
            method="suffix"    # Append the next chunk's context to the current chunk
        )

        # First chunk the content, then apply the overlap refinery
        initial_chunks = llm_chunker.chunk(content)
        chunks = overlap_refinery.refine(initial_chunks)
        total_chunks = len(chunks)
        logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")

        # Template for per-chunk summarization
        chunk_template = PromptTemplate(
            input_variables=["chunk", "chunk_number", "total_chunks"],
            template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.

Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes

Provide a clear, structured summary that captures the essential content.

Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>"""
        )

        # Process each chunk individually
        chunk_summaries = []
        for i, chunk in enumerate(chunks, 1):
            try:
                logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")
                chunk_chain = chunk_template | llm
                chunk_result = await chunk_chain.ainvoke({
                    "chunk": chunk.text,
                    "chunk_number": i,
                    "total_chunks": total_chunks
                })
                chunk_summary = chunk_result.content
                chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
                logger.info(f"✅ Completed chunk {i}/{total_chunks}")
            except Exception as e:
                logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
                chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")

        # Combine the section summaries into a final document summary
        logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
        try:
            combine_template = PromptTemplate(
                input_variables=["summaries", "document_title"],
                template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.

Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone

<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>"""
            )

            combined_summaries = "\n\n".join(chunk_summaries)
            combine_chain = combine_template | llm
            final_result = await combine_chain.ainvoke({
                "summaries": combined_summaries,
                "document_title": document_title
            })
            final_summary = final_result.content
            logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")
            return final_summary
        except Exception as e:
            logger.error(f"❌ Failed to combine summaries: {e}")
            # Fallback: return the concatenated chunk summaries
            fallback_summary = "\n\n".join(chunk_summaries)
            logger.warning("⚠️ Using fallback combined summary")
            return fallback_summary

def create_docling_service() -> DoclingService:
    """Create a Docling service instance."""
    return DoclingService()
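
For reference, the intended call pattern from the rest of the backend looks roughly like this; the PDF path is illustrative, and the result keys follow process_document above:

# Sketch of how DoclingService is consumed (mirrors documents_routes.py)
import asyncio
from app.services.document_processing.docling_service import create_docling_service

async def demo():
    service = create_docling_service()  # detects GPU, builds the converter
    result = await service.process_document("/tmp/report.pdf", "report.pdf")
    assert result["status"] == "success"
    print(result["content"][:200])      # markdown extracted by Docling

asyncio.run(demo())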

surfsense_backend/app/tasks/background_tasks.py

@@ -459,6 +459,94 @@ async def add_received_file_document_using_llamacloud(
        raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")

async def add_received_file_document_using_docling(
    session: AsyncSession,
    file_name: str,
    docling_markdown_document: str,
    search_space_id: int,
    user_id: str,
) -> Optional[Document]:
    """
    Process and store document content parsed by Docling.

    Args:
        session: Database session
        file_name: Name of the processed file
        docling_markdown_document: Markdown content from Docling parsing
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    try:
        file_in_markdown = docling_markdown_document
        content_hash = generate_content_hash(file_in_markdown, search_space_id)

        # Check if a document with this content hash already exists
        existing_doc_result = await session.execute(
            select(Document).where(Document.content_hash == content_hash)
        )
        existing_document = existing_doc_result.scalars().first()
        if existing_document:
            logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
            return existing_document

        # Get the user's long-context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate a summary, using chunked processing for large documents
        from app.services.document_processing.docling_service import create_docling_service
        docling_service = create_docling_service()
        summary_content = await docling_service.process_large_document_summary(
            content=file_in_markdown,
            llm=user_llm,
            document_title=file_name
        )
        summary_embedding = config.embedding_model_instance.embed(summary_content)

        # Process chunks
        chunks = [
            Chunk(
                content=chunk.text,
                embedding=config.embedding_model_instance.embed(chunk.text),
            )
            for chunk in config.chunker_instance.chunk(file_in_markdown)
        ]

        # Create and store the document
        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=DocumentType.FILE,
            document_metadata={
                "FILE_NAME": file_name,
                "ETL_SERVICE": "DOCLING",
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )
        session.add(document)
        await session.commit()
        await session.refresh(document)
        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        raise db_error
    except Exception as e:
        await session.rollback()
        raise RuntimeError(f"Failed to process file document using Docling: {str(e)}")

async def add_youtube_video_document(
    session: AsyncSession, url: str, search_space_id: int, user_id: str
):
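
The new task mirrors its LlamaCloud counterpart, so callers only swap in Docling's markdown output. A rough invocation sketch with placeholder IDs:

# Hypothetical caller - matches the DOCLING branch in documents_routes.py
document = await add_received_file_document_using_docling(
    session,                             # AsyncSession from get_async_session
    "report.pdf",
    docling_markdown_document=markdown,  # output of DoclingService.process_document
    search_space_id=1,                   # placeholder
    user_id="placeholder-user-uuid",     # placeholder
)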

surfsense_backend/pyproject.toml

@@ -9,6 +9,7 @@ dependencies = [
    "asyncpg>=0.30.0",
    "chonkie[all]>=1.0.6",
    "discord-py>=2.5.2",
    "docling>=2.15.0",
    "fastapi>=0.115.8",
    "fastapi-users[oauth,sqlalchemy]>=14.0.1",
    "firecrawl-py>=1.12.0",
@@ -17,7 +18,7 @@ dependencies = [
    "langchain-unstructured>=0.1.6",
    "langgraph>=0.3.29",
    "linkup-sdk>=0.2.4",
-    "litellm>=1.61.4",
+    "litellm>=1.61.4,<1.70.0",
    "llama-cloud-services>=0.6.25",
    "markdownify>=0.14.1",
    "notion-client>=2.3.0",
@@ -29,6 +30,7 @@ dependencies = [
    "slack-sdk>=3.34.0",
    "static-ffmpeg>=2.13",
    "tavily-python>=0.3.2",
    "tesserocr>=2.8.0",
    "unstructured-client>=0.30.0",
    "unstructured[all-docs]>=0.16.25",
    "uvicorn[standard]>=0.34.0",

surfsense_backend/uv.lock generated
File diff suppressed because it is too large

surfsense_web/.env.example

@@ -1,3 +1,3 @@
NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
-NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
+NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD or DOCLING