feat: Fixed Document Summary Content across connectors and processors

2025-09-09 13:54:40 +00:00 · 2025-08-18 20:51:48 -07:00 · 2025-08-18 20:51:48 -07:00 · 1c4c61eb04
commit 1c4c61eb04
parent c6921a4083
19 changed files with 474 additions and 233 deletions
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@ -10,13 +10,17 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.confluence_connector import ConfluenceConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
    calculate_date_range,
    check_duplicate_document_by_hash,
-    create_document_chunks,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
@ -213,21 +217,6 @@ async def index_confluence_pages(
                    documents_skipped += 1
                    continue

-                # Create a simple summary
-                summary_content = (
-                    f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
-                )
-                if page_content:
-                    # Take first 500 characters of content for summary
-                    content_preview = page_content[:500]
-                    if len(page_content) > 500:
-                        content_preview += "..."
-                    summary_content += f"Content Preview: {content_preview}\n\n"
-
-                # Add comment count
-                comment_count = len(comments)
-                summary_content += f"Comments: {comment_count}"
-
                # Generate content hash
                content_hash = generate_content_hash(full_content, search_space_id)

@ -243,10 +232,40 @@ async def index_confluence_pages(
                    documents_skipped += 1
                    continue

-                # Generate embedding for the summary
-                summary_embedding = config.embedding_model_instance.embed(
-                    summary_content
-                )
+                # Generate summary with metadata
+                user_llm = await get_user_long_context_llm(session, user_id)
+                comment_count = len(comments)
+
+                if user_llm:
+                    document_metadata = {
+                        "page_title": page_title,
+                        "page_id": page_id,
+                        "space_id": space_id,
+                        "comment_count": comment_count,
+                        "document_type": "Confluence Page",
+                        "connector_type": "Confluence",
+                    }
+                    (
+                        summary_content,
+                        summary_embedding,
+                    ) = await generate_document_summary(
+                        full_content, user_llm, document_metadata
+                    )
+                else:
+                    # Fallback to simple summary if no LLM configured
+                    summary_content = (
+                        f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
+                    )
+                    if page_content:
+                        # Take first 500 characters of content for summary
+                        content_preview = page_content[:500]
+                        if len(page_content) > 500:
+                            content_preview += "..."
+                        summary_content += f"Content Preview: {content_preview}\n\n"
+                    summary_content += f"Comments: {comment_count}"
+                    summary_embedding = config.embedding_model_instance.embed(
+                        summary_content
+                    )

                # Process chunks - using the full page content with comments
                chunks = await create_document_chunks(full_content)