refactor: refactored background_tasks & indexing_tasks

2025-09-09 13:54:40 +00:00 · 2025-08-12 15:28:13 -07:00 · 2025-08-12 15:28:13 -07:00 · 5aa52375c3
commit 5aa52375c3
parent 356bbb86f5
24 changed files with 4704 additions and 5149 deletions
--- a/surfsense_backend/app/tasks/document_processors/base.py
+++ b/surfsense_backend/app/tasks/document_processors/base.py
@ -0,0 +1,75 @@
+"""
+Base functionality and shared imports for document processors.
+"""
+
+
+from langchain_community.document_transformers import MarkdownifyTransformer
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.config import config
+from app.db import Chunk, Document
+from app.prompts import SUMMARY_PROMPT_TEMPLATE
+
+# Initialize markdown transformer
+md = MarkdownifyTransformer()
+
+
+async def check_duplicate_document(
+    session: AsyncSession, content_hash: str
+) -> Document | None:
+    """
+    Check if a document with the given content hash already exists.
+
+    Args:
+        session: Database session
+        content_hash: Hash of the document content
+
+    Returns:
+        Existing document if found, None otherwise
+    """
+    existing_doc_result = await session.execute(
+        select(Document).where(Document.content_hash == content_hash)
+    )
+    return existing_doc_result.scalars().first()
+
+
+async def create_document_chunks(content: str) -> list[Chunk]:
+    """
+    Create chunks from document content.
+
+    Args:
+        content: Document content to chunk
+
+    Returns:
+        List of Chunk objects with embeddings
+    """
+    return [
+        Chunk(
+            content=chunk.text,
+            embedding=config.embedding_model_instance.embed(chunk.text),
+        )
+        for chunk in config.chunker_instance.chunk(content)
+    ]
+
+
+async def generate_document_summary(
+    content: str, user_llm, document_title: str = ""
+) -> tuple[str, list[float]]:
+    """
+    Generate summary and embedding for document content.
+
+    Args:
+        content: Document content
+        user_llm: User's LLM instance
+        document_title: Optional document title for context
+
+    Returns:
+        Tuple of (summary_content, summary_embedding)
+    """
+    summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
+    summary_result = await summary_chain.ainvoke({"document": content})
+    summary_content = summary_result.content
+    summary_embedding = config.embedding_model_instance.embed(summary_content)
+
+    return summary_content, summary_embedding