feat: Fixed Document Summary Content across connectors and processors

2025-09-10 22:34:39 +00:00 · 2025-08-18 20:51:48 -07:00 · 2025-08-18 20:51:48 -07:00 · 1c4c61eb04
commit 1c4c61eb04
parent c6921a4083
19 changed files with 474 additions and 233 deletions
--- a/surfsense_backend/app/tasks/document_processors/base.py
+++ b/surfsense_backend/app/tasks/document_processors/base.py
@@ -6,9 +6,7 @@ from langchain_community.document_transformers import MarkdownifyTransformer
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select

-from app.config import config
-from app.db import Chunk, Document
-from app.prompts import SUMMARY_PROMPT_TEMPLATE
+from app.db import Document

 # Initialize markdown transformer
 md = MarkdownifyTransformer()
@@ -31,44 +29,3 @@ async def check_duplicate_document(
        select(Document).where(Document.content_hash == content_hash)
    )
    return existing_doc_result.scalars().first()
-
-
-async def create_document_chunks(content: str) -> list[Chunk]:
-    """
-    Create chunks from document content.
-
-    Args:
-        content: Document content to chunk
-
-    Returns:
-        List of Chunk objects with embeddings
-    """
-    return [
-        Chunk(
-            content=chunk.text,
-            embedding=config.embedding_model_instance.embed(chunk.text),
-        )
-        for chunk in config.chunker_instance.chunk(content)
-    ]
-
-
-async def generate_document_summary(
-    content: str, user_llm, document_title: str = ""
-) -> tuple[str, list[float]]:
-    """
-    Generate summary and embedding for document content.
-
-    Args:
-        content: Document content
-        user_llm: User's LLM instance
-        document_title: Optional document title for context
-
-    Returns:
-        Tuple of (summary_content, summary_embedding)
-    """
-    summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
-    summary_result = await summary_chain.ainvoke({"document": content})
-    summary_content = summary_result.content
-    summary_embedding = config.embedding_model_instance.embed(summary_content)
-
-    return summary_content, summary_embedding
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@@ -11,12 +11,14 @@ from app.db import Document, DocumentType
 from app.schemas import ExtensionDocumentContent
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
    check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )


@@ -106,9 +108,18 @@ async def add_extension_received_document(
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "session_id": content.metadata.BrowsingSessionId,
+            "url": content.metadata.VisitedWebPageURL,
+            "title": content.metadata.VisitedWebPageTitle,
+            "referrer": content.metadata.VisitedWebPageReffererURL,
+            "timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
+            "duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
+            "document_type": "Browser Extension Capture",
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
        )

        # Process chunks
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -12,13 +12,13 @@ from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
    convert_document_to_markdown,
+    create_document_chunks,
    generate_content_hash,
+    generate_document_summary,
 )

 from .base import (
    check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )


@@ -64,9 +64,14 @@ async def add_received_file_document_using_unstructured(
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "UNSTRUCTURED",
+            "document_type": "File Document",
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
        )

        # Process chunks
@@ -139,9 +144,14 @@ async def add_received_file_document_using_llamacloud(
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "LLAMACLOUD",
+            "document_type": "File Document",
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
        )

        # Process chunks
@@ -224,9 +234,30 @@ async def add_received_file_document_using_docling(
            content=file_in_markdown, llm=user_llm, document_title=file_name
        )

+        # Enhance summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "DOCLING",
+            "document_type": "File Document",
+        }
+        metadata_parts = []
+        metadata_parts.append("# DOCUMENT METADATA")
+
+        for key, value in document_metadata.items():
+            if value:  # Only include non-empty values
+                formatted_key = key.replace("_", " ").title()
+                metadata_parts.append(f"**{formatted_key}:** {value}")
+
+        metadata_section = "\n".join(metadata_parts)
+        enhanced_summary_content = (
+            f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
+        )
+
        from app.config import config

-        summary_embedding = config.embedding_model_instance.embed(summary_content)
+        summary_embedding = config.embedding_model_instance.embed(
+            enhanced_summary_content
+        )

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
@@ -240,7 +271,7 @@ async def add_received_file_document_using_docling(
                "FILE_NAME": file_name,
                "ETL_SERVICE": "DOCLING",
            },
-            content=summary_content,
+            content=enhanced_summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -10,12 +10,14 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
    check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )


@@ -77,9 +79,13 @@ async def add_received_markdown_file_document(
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "document_type": "Markdown File Document",
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
        )

        # Process chunks
--- a/surfsense_backend/app/tasks/document_processors/url_crawler.py
+++ b/surfsense_backend/app/tasks/document_processors/url_crawler.py
@@ -13,12 +13,14 @@ from app.config import config
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
    check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
    md,
 )

@@ -170,8 +172,15 @@ async def add_crawled_url_document(
            {"stage": "summary_generation"},
        )

+        # Generate summary with metadata
+        document_metadata = {
+            "url": url,
+            "title": url_crawled[0].metadata.get("title", url),
+            "document_type": "Crawled URL Document",
+            "crawler_type": type(crawl_loader).__name__,
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
        )

        # Process chunks
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@@ -13,12 +13,14 @@ from youtube_transcript_api import YouTubeTranscriptApi
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
    check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )


@@ -242,8 +244,18 @@ async def add_youtube_video_document(
            {"stage": "summary_generation"},
        )

+        # Generate summary with metadata
+        document_metadata = {
+            "url": url,
+            "video_id": video_id,
+            "title": video_data.get("title", "YouTube Video"),
+            "author": video_data.get("author_name", "Unknown"),
+            "thumbnail": video_data.get("thumbnail_url", ""),
+            "document_type": "YouTube Video Document",
+            "has_transcript": "No captions available" not in transcript_text,
+        }
        summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
        )

        # Process chunks