mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-10 14:28:57 +00:00
feat: Add Docling support as ETL_SERVICE option
- Added DOCLING as a third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing with user selection

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows the same pattern as LlamaCloud integration (PR #123)
This commit is contained in:
parent
f852bcb188
commit
aa00822169
14 changed files with 3125 additions and 2090 deletions
|
@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
|
|||
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
|
||||
from app.config import config as app_config
|
||||
# Force asyncio to use standard event loop before unstructured imports
|
||||
import asyncio
|
||||
|
@ -269,6 +269,31 @@ async def process_file_in_background(
|
|||
search_space_id=search_space_id,
|
||||
user_id=user_id
|
||||
)
|
||||
elif app_config.ETL_SERVICE == "DOCLING":
|
||||
# Use Docling service for document processing
|
||||
from app.services.document_processing.docling_service import create_docling_service
|
||||
|
||||
# Create Docling service
|
||||
docling_service = create_docling_service()
|
||||
|
||||
# Process the document
|
||||
result = await docling_service.process_document(file_path, filename)
|
||||
|
||||
# Clean up the temp file
|
||||
import os
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Process the document using our Docling background task
|
||||
await add_received_file_document_using_docling(
|
||||
session,
|
||||
filename,
|
||||
docling_markdown_document=result['content'],
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id
|
||||
)
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"Error processing file in background: {str(e)}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue