Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-09 13:54:40 +00:00)
Fixed all ruff lint and formatting errors
This commit is contained in:
parent 0a03c42cc5
commit d359a59f6d
85 changed files with 5520 additions and 3870 deletions
@ -5,15 +5,16 @@ SSL-safe implementation with pre-downloaded models
 """

 import logging
-import ssl
 import os
-from typing import Dict, Any
+import ssl
+from typing import Any

 logger = logging.getLogger(__name__)

+
 class DoclingService:
     """Docling service for enhanced document processing with SSL fixes."""

     def __init__(self):
         """Initialize Docling service with SSL, model fixes, and GPU acceleration."""
         self.converter = None
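The import churn in this hunk is ruff's import sorting plus its pyupgrade rules: `Dict[str, Any]` becomes the builtin generic `dict[str, Any]` (PEP 585), and later in the diff `filename: str = None` becomes the explicit `str | None` (PEP 604). A minimal sketch of the rewritten style; the function and names here are illustrative only:

```python
from __future__ import annotations  # keeps the syntax valid on Python 3.9

from typing import Any


def describe(meta: dict[str, Any], name: str | None = None) -> dict[str, Any]:
    # Builtin generics and the | union replace typing.Dict / typing.Optional.
    return {**meta, "name": name}
```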
@ -21,30 +22,32 @@ class DoclingService:
         self._configure_ssl_environment()
         self._check_wsl2_gpu_support()
         self._initialize_docling()

     def _configure_ssl_environment(self):
         """Configure SSL environment for secure model downloads."""
         try:
             # Set SSL context for downloads
             ssl._create_default_https_context = ssl._create_unverified_context

             # Set SSL environment variables if not already set
-            if not os.environ.get('SSL_CERT_FILE'):
+            if not os.environ.get("SSL_CERT_FILE"):
                 try:
                     import certifi
-                    os.environ['SSL_CERT_FILE'] = certifi.where()
-                    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
+
+                    os.environ["SSL_CERT_FILE"] = certifi.where()
+                    os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
                 except ImportError:
                     pass

             logger.info("🔐 SSL environment configured for model downloads")
         except Exception as e:
             logger.warning(f"⚠️ SSL configuration warning: {e}")

     def _check_wsl2_gpu_support(self):
         """Check and configure GPU support for WSL2 environment."""
         try:
             import torch
+
             if torch.cuda.is_available():
                 gpu_count = torch.cuda.device_count()
                 gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
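The certifi fallback above can be exercised on its own; a hedged sketch, where `setdefault` mirrors the `if not os.environ.get(...)` guard in the method:

```python
import os

try:
    import certifi

    # Point HTTPS clients that honor these variables at certifi's CA bundle.
    os.environ.setdefault("SSL_CERT_FILE", certifi.where())
    os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
except ImportError:
    pass  # fall back to the system trust store

print(os.environ.get("SSL_CERT_FILE", "<system default>"))
```

Note that the method also reassigns `ssl._create_default_https_context` to the unverified context, which disables certificate verification process-wide; this lint pass leaves it alone, though bandit-style security checks would likely flag it.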
@ -60,34 +63,34 @@ class DoclingService:
         except Exception as e:
             logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
             self.use_gpu = False

     def _initialize_docling(self):
         """Initialize Docling with version-safe configuration."""
         try:
-            from docling.document_converter import DocumentConverter, PdfFormatOption
+            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
             from docling.datamodel.base_models import InputFormat
             from docling.datamodel.pipeline_options import PdfPipelineOptions
-            from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+            from docling.document_converter import DocumentConverter, PdfFormatOption

             logger.info("🔧 Initializing Docling with version-safe configuration...")

             # Create pipeline options with version-safe attribute checking
             pipeline_options = PdfPipelineOptions()

             # Disable OCR (user request)
-            if hasattr(pipeline_options, 'do_ocr'):
+            if hasattr(pipeline_options, "do_ocr"):
                 pipeline_options.do_ocr = False
                 logger.info("⚠️ OCR disabled by user request")
             else:
                 logger.warning("⚠️ OCR attribute not available in this Docling version")

             # Enable table structure if available
-            if hasattr(pipeline_options, 'do_table_structure'):
+            if hasattr(pipeline_options, "do_table_structure"):
                 pipeline_options.do_table_structure = True
                 logger.info("✅ Table structure detection enabled")

             # Configure GPU acceleration for WSL2 if available
-            if hasattr(pipeline_options, 'accelerator_device'):
+            if hasattr(pipeline_options, "accelerator_device"):
                 if self.use_gpu:
                     try:
                         pipeline_options.accelerator_device = "cuda"
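The quote changes above are ruff-format's doing; the hasattr-probing pattern underneath is unchanged and easy to test in isolation. A small sketch of the same "version-safe" configuration against a stand-in options object (`DummyOptions` is hypothetical, not a Docling class):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DummyOptions:
    """Stand-in for PdfPipelineOptions from an arbitrary Docling version."""

    do_table_structure = False  # present in this "version"; do_ocr deliberately missing


options = DummyOptions()
for attr, value in [("do_ocr", False), ("do_table_structure", True)]:
    if hasattr(options, attr):
        setattr(options, attr, value)
        logger.info("set %s=%s", attr, value)
    else:
        logger.warning("%s not available in this version", attr)
```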
@ -99,164 +102,180 @@ class DoclingService:
                 pipeline_options.accelerator_device = "cpu"
                 logger.info("🖥️ Using CPU acceleration")
             else:
-                logger.info("ℹ️ Accelerator device attribute not available in this Docling version")
+                logger.info(
+                    "⚠️ Accelerator device attribute not available in this Docling version"
+                )

             # Create PDF format option with backend
             pdf_format_option = PdfFormatOption(
-                pipeline_options=pipeline_options,
-                backend=PyPdfiumDocumentBackend
+                pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
             )

             # Initialize DocumentConverter
             self.converter = DocumentConverter(
-                format_options={
-                    InputFormat.PDF: pdf_format_option
-                }
+                format_options={InputFormat.PDF: pdf_format_option}
             )

             acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
-            logger.info(f"✅ Docling initialized successfully with {acceleration_type} acceleration")
+            logger.info(
+                f"✅ Docling initialized successfully with {acceleration_type} acceleration"
+            )

         except ImportError as e:
             logger.error(f"❌ Docling not installed: {e}")
-            raise RuntimeError(f"Docling not available: {e}")
+            raise RuntimeError(f"Docling not available: {e}") from e
         except Exception as e:
             logger.error(f"❌ Docling initialization failed: {e}")
-            raise RuntimeError(f"Docling initialization failed: {e}")
+            raise RuntimeError(f"Docling initialization failed: {e}") from e
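The `from e` suffixes here (and again in `process_document` below) satisfy ruff's B904: re-raising inside an `except` block should chain the cause so the original traceback survives as `__cause__`. A self-contained sketch of the pattern:

```python
def load_backend() -> None:
    try:
        import nonexistent_backend  # placeholder module that will fail to import
    except ImportError as e:
        # B904: chain the cause instead of discarding it.
        raise RuntimeError(f"backend not available: {e}") from e


try:
    load_backend()
except RuntimeError as err:
    print(err, "| __cause__:", type(err.__cause__).__name__)
```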
     def _configure_easyocr_local_models(self):
         """Configure EasyOCR to use pre-downloaded local models."""
         try:
-            import easyocr
             import os
+
+            import easyocr

             # Set SSL environment for EasyOCR downloads
-            os.environ['CURL_CA_BUNDLE'] = ''
-            os.environ['REQUESTS_CA_BUNDLE'] = ''
+            os.environ["CURL_CA_BUNDLE"] = ""
+            os.environ["REQUESTS_CA_BUNDLE"] = ""

             # Try to use local models first, fallback to download if needed
             try:
-                reader = easyocr.Reader(['en'],
-                                        download_enabled=False,
-                                        model_storage_directory="/root/.EasyOCR/model")
+                reader = easyocr.Reader(
+                    ["en"],
+                    download_enabled=False,
+                    model_storage_directory="/root/.EasyOCR/model",
+                )
                 logger.info("✅ EasyOCR configured for local models")
                 return reader
-            except:
+            except Exception:
                 # If local models fail, allow download with SSL bypass
-                logger.info("🔄 Local models failed, attempting download with SSL bypass...")
-                reader = easyocr.Reader(['en'],
-                                        download_enabled=True,
-                                        model_storage_directory="/root/.EasyOCR/model")
+                logger.info(
+                    "🔄 Local models failed, attempting download with SSL bypass..."
+                )
+                reader = easyocr.Reader(
+                    ["en"],
+                    download_enabled=True,
+                    model_storage_directory="/root/.EasyOCR/model",
+                )
                 logger.info("✅ EasyOCR configured with downloaded models")
                 return reader
         except Exception as e:
             logger.warning(f"⚠️ EasyOCR configuration failed: {e}")
             return None
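For context, `_configure_easyocr_local_models` is not invoked from any hunk shown here; a hypothetical call site might look like the following (the image path is a placeholder, and the reader is None when configuration failed):

```python
service = DoclingService()
reader = service._configure_easyocr_local_models()
if reader is not None:
    # easyocr.Reader.readtext yields (bbox, text, confidence) tuples.
    for _bbox, text, confidence in reader.readtext("scanned_page.png"):
        print(f"{confidence:.2f}  {text}")
```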
-    async def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
+    async def process_document(
+        self, file_path: str, filename: str | None = None
+    ) -> dict[str, Any]:
         """Process document with Docling using pre-downloaded models."""

         if self.converter is None:
             raise RuntimeError("Docling converter not initialized")

         try:
-            logger.info(f"🔄 Processing {filename} with Docling (using local models)...")
+            logger.info(
+                f"🔄 Processing {filename} with Docling (using local models)..."
+            )

             # Process document with local models
             result = self.converter.convert(file_path)

             # Extract content using version-safe methods
             content = None
-            if hasattr(result, 'document') and result.document:
+            if hasattr(result, "document") and result.document:
                 # Try different export methods (version compatibility)
-                if hasattr(result.document, 'export_to_markdown'):
+                if hasattr(result.document, "export_to_markdown"):
                     content = result.document.export_to_markdown()
                     logger.info("📄 Used export_to_markdown method")
-                elif hasattr(result.document, 'to_markdown'):
+                elif hasattr(result.document, "to_markdown"):
                     content = result.document.to_markdown()
                     logger.info("📄 Used to_markdown method")
-                elif hasattr(result.document, 'text'):
+                elif hasattr(result.document, "text"):
                     content = result.document.text
                     logger.info("📄 Used text property")
-                elif hasattr(result.document, '__str__'):
+                elif hasattr(result.document, "__str__"):
                     content = str(result.document)
                     logger.info("📄 Used string conversion")

                 if content:
-                    logger.info(f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)")
+                    logger.info(
+                        f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)"
+                    )
                     return {
-                        'content': content,
-                        'full_text': content,
-                        'service_used': 'docling',
-                        'status': 'success',
-                        'processing_notes': 'Processed with Docling using pre-downloaded models'
+                        "content": content,
+                        "full_text": content,
+                        "service_used": "docling",
+                        "status": "success",
+                        "processing_notes": "Processed with Docling using pre-downloaded models",
                     }
                 else:
                     raise ValueError("No content could be extracted from document")
             else:
                 raise ValueError("No document object returned by Docling")

         except Exception as e:
             logger.error(f"❌ Docling processing failed for {filename}: {e}")
             # Log the full error for debugging
             import traceback
+
             logger.error(f"Full traceback: {traceback.format_exc()}")
-            raise RuntimeError(f"Docling processing failed: {e}")
+            raise RuntimeError(f"Docling processing failed: {e}") from e
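A hedged usage sketch for the reshaped signature: `filename` is now an explicit `str | None` and the return type the builtin `dict[str, Any]`. This assumes an environment where Docling initialized successfully; the path is a placeholder:

```python
import asyncio


async def main() -> None:
    service = DoclingService()
    result = await service.process_document("report.pdf", filename="report.pdf")
    print(result["service_used"], result["status"], len(result["full_text"]))


asyncio.run(main())
```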
     async def process_large_document_summary(
-        self,
-        content: str,
-        llm,
-        document_title: str = "Document"
+        self, content: str, llm, document_title: str = "Document"
     ) -> str:
         """
         Process large documents using chunked LLM summarization.

         Args:
             content: The full document content
             llm: The language model to use for summarization
             document_title: Title of the document for context

         Returns:
             Final summary of the document
         """
         # Large document threshold (100K characters ≈ 25K tokens)
-        LARGE_DOCUMENT_THRESHOLD = 100_000
-
-        if len(content) <= LARGE_DOCUMENT_THRESHOLD:
+        large_document_threshold = 100_000
+
+        if len(content) <= large_document_threshold:
             # For smaller documents, use direct processing
-            logger.info(f"📄 Document size: {len(content)} chars - using direct processing")
+            logger.info(
+                f"📄 Document size: {len(content)} chars - using direct processing"
+            )
             from app.prompts import SUMMARY_PROMPT_TEMPLATE

             summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
             result = await summary_chain.ainvoke({"document": content})
             return result.content

-        logger.info(f"📚 Large document detected: {len(content)} chars - using chunked processing")
+        logger.info(
+            f"📚 Large document detected: {len(content)} chars - using chunked processing"
+        )

         # Import chunker from config
         from app.config import config
-        from langchain_core.prompts import PromptTemplate

         # Create LLM-optimized chunks (8K tokens max for safety)
-        from chonkie import RecursiveChunker, OverlapRefinery
+        from chonkie import OverlapRefinery, RecursiveChunker
+        from langchain_core.prompts import PromptTemplate

         llm_chunker = RecursiveChunker(
             chunk_size=8000  # Conservative for most LLMs
         )

         # Apply overlap refinery for context preservation (10% overlap = 800 tokens)
         overlap_refinery = OverlapRefinery(
             context_size=0.1,  # 10% overlap for context preservation
-            method="suffix"  # Add next chunk context to current chunk
+            method="suffix",  # Add next chunk context to current chunk
         )

         # First chunk the content, then apply overlap refinery
         initial_chunks = llm_chunker.chunk(content)
         chunks = overlap_refinery.refine(initial_chunks)
         total_chunks = len(chunks)

         logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")

         # Template for chunk processing
         chunk_template = PromptTemplate(
             input_variables=["chunk", "chunk_number", "total_chunks"],
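The chunking setup in this hunk stands alone if chonkie is installed; the values below mirror the diff (8K-token chunks, 10% suffix overlap), while the input text is a placeholder:

```python
from chonkie import OverlapRefinery, RecursiveChunker

chunker = RecursiveChunker(chunk_size=8000)  # conservative LLM context budget
refinery = OverlapRefinery(context_size=0.1, method="suffix")  # 10% suffix overlap

chunks = refinery.refine(chunker.chunk("some very long document text " * 2000))
for i, chunk in enumerate(chunks, 1):
    print(i, len(chunk.text))
```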
@ -274,34 +293,38 @@ Chunk {chunk_number}/{total_chunks}:
 <document_chunk>
 {chunk}
 </document_chunk>
-</INSTRUCTIONS>"""
+</INSTRUCTIONS>""",
         )

         # Process each chunk individually
         chunk_summaries = []
         for i, chunk in enumerate(chunks, 1):
             try:
-                logger.info(f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)")
+                logger.info(
+                    f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)"
+                )

                 chunk_chain = chunk_template | llm
-                chunk_result = await chunk_chain.ainvoke({
-                    "chunk": chunk.text,
-                    "chunk_number": i,
-                    "total_chunks": total_chunks
-                })
+                chunk_result = await chunk_chain.ainvoke(
+                    {
+                        "chunk": chunk.text,
+                        "chunk_number": i,
+                        "total_chunks": total_chunks,
+                    }
+                )

                 chunk_summary = chunk_result.content
                 chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")

                 logger.info(f"✅ Completed chunk {i}/{total_chunks}")

             except Exception as e:
                 logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
                 chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")

         # Combine summaries into final document summary
         logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")

         try:
             combine_template = PromptTemplate(
                 input_variables=["summaries", "document_title"],
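The `chunk_template | llm` pipe is LangChain's LCEL composition, awaited one chunk at a time via `ainvoke`. A minimal sketch with a fake chat model standing in for the real one, so `.content` exists exactly as the diff uses it:

```python
import asyncio

from langchain_core.language_models.fake_chat_models import FakeListChatModel
from langchain_core.prompts import PromptTemplate

template = PromptTemplate(
    input_variables=["chunk", "chunk_number", "total_chunks"],
    template="Summarize chunk {chunk_number}/{total_chunks}:\n{chunk}",
)
llm = FakeListChatModel(responses=["(summary of the chunk)"])


async def main() -> None:
    chain = template | llm  # PromptTemplate piped into the model
    result = await chain.ainvoke(
        {"chunk": "chunk text", "chunk_number": 1, "total_chunks": 1}
    )
    print(result.content)


asyncio.run(main())
```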
@ -318,22 +341,23 @@ Ensure:
 <section_summaries>
 {summaries}
 </section_summaries>
-</INSTRUCTIONS>"""
+</INSTRUCTIONS>""",
             )

             combined_summaries = "\n\n".join(chunk_summaries)
             combine_chain = combine_template | llm
-            final_result = await combine_chain.ainvoke({
-                "summaries": combined_summaries,
-                "document_title": document_title
-            })
+            final_result = await combine_chain.ainvoke(
+                {"summaries": combined_summaries, "document_title": document_title}
+            )

             final_summary = final_result.content
-            logger.info(f"✅ Large document processing complete: {len(final_summary)} chars summary")
+            logger.info(
+                f"✅ Large document processing complete: {len(final_summary)} chars summary"
+            )

             return final_summary

         except Exception as e:
             logger.error(f"❌ Failed to combine summaries: {e}")
             # Fallback: return concatenated chunk summaries
@ -341,6 +365,7 @@ Ensure:
             logger.warning("⚠️ Using fallback combined summary")
             return fallback_summary

+
 def create_docling_service() -> DoclingService:
     """Create a Docling service instance."""
-    return DoclingService()
\ No newline at end of file
+    return DoclingService()