fix: slack indexing

- Index individual messages as Documents instead of concatenating them.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-08-21 14:23:52 -07:00
parent 0db3c32144
commit f443a6636f
2 changed files with 101 additions and 85 deletions

View file

@ -160,3 +160,41 @@ def build_document_metadata_string(
document_parts.append("</DOCUMENT>") document_parts.append("</DOCUMENT>")
return "\n".join(document_parts) return "\n".join(document_parts)
def build_document_metadata_markdown(
    metadata_sections: list[tuple[str, list[str]]],
) -> str:
    """
    Render metadata sections as a single markdown document.

    Args:
        metadata_sections: List of (section_title, section_content) tuples,
            where section_content is a list of lines. The sentinel lines
            ``TEXT_START``/``TEXT_END`` and any line beginning with
            ``"FORMAT: "`` are dropped, since markdown needs neither text
            delimiters nor format indicators.

    Returns:
        The combined markdown string: each section becomes an ``##`` header
        (title-cased) followed by its surviving content lines, with blank
        lines separating sections and no trailing blank lines.
    """
    delimiters = ("TEXT_START", "TEXT_END")
    lines: list[str] = []

    for title, content in metadata_sections:
        # Section header in title case, followed by a spacer line.
        lines.append(f"## {title.title()}")
        lines.append("")
        # Keep only real content: drop delimiter sentinels and format tags.
        lines.extend(
            entry
            for entry in content
            if entry not in delimiters and not entry.startswith("FORMAT: ")
        )
        # Blank separator after each section.
        lines.append("")

    # Strip any run of empty lines left at the end of the document.
    while lines and lines[-1] == "":
        lines.pop()

    return "\n".join(lines)

View file

@ -8,18 +8,17 @@ from slack_sdk.errors import SlackApiError
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.slack_history import SlackHistory from app.connectors.slack_history import SlackHistory
from app.db import Document, DocumentType, SearchSourceConnectorType from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import ( from app.utils.document_converters import (
create_document_chunks, create_document_chunks,
generate_content_hash, generate_content_hash,
generate_document_summary,
) )
from .base import ( from .base import (
build_document_metadata_string, build_document_metadata_markdown,
calculate_date_range, calculate_date_range,
check_duplicate_document_by_hash, check_duplicate_document_by_hash,
get_connector_by_id, get_connector_by_id,
@ -234,17 +233,11 @@ async def index_slack_messages(
documents_skipped += 1 documents_skipped += 1
continue # Skip if no valid messages after filtering continue # Skip if no valid messages after filtering
# Convert messages to markdown format
channel_content = f"# Slack Channel: {channel_name}\n\n"
for msg in formatted_messages: for msg in formatted_messages:
user_name = msg.get("user_name", "Unknown User")
timestamp = msg.get("datetime", "Unknown Time") timestamp = msg.get("datetime", "Unknown Time")
text = msg.get("text", "") msg_user_name = msg.get("user_name", "Unknown User")
msg_user_email = msg.get("user_email", "Unknown Email")
channel_content += ( msg_text = msg.get("text", "")
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
)
# Format document metadata # Format document metadata
metadata_sections = [ metadata_sections = [
@ -253,17 +246,19 @@ async def index_slack_messages(
[ [
f"CHANNEL_NAME: {channel_name}", f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}", f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}", f"MESSAGE_TIMESTAMP: {timestamp}",
f"MESSAGE_USER_NAME: {msg_user_name}",
f"MESSAGE_USER_EMAIL: {msg_user_email}",
], ],
), ),
( (
"CONTENT", "CONTENT",
["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], ["FORMAT: markdown", "TEXT_START", msg_text, "TEXT_END"],
), ),
] ]
# Build the document string # Build the document string
combined_document_string = build_document_metadata_string( combined_document_string = build_document_metadata_markdown(
metadata_sections metadata_sections
) )
content_hash = generate_content_hash( content_hash = generate_content_hash(
@ -282,28 +277,11 @@ async def index_slack_messages(
documents_skipped += 1 documents_skipped += 1
continue continue
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
logger.error(f"No long context LLM configured for user {user_id}")
skipped_channels.append(f"{channel_name} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"channel_name": channel_name,
"channel_id": channel_id,
"message_count": len(formatted_messages),
"document_type": "Slack Channel Messages",
"connector_type": "Slack",
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Process chunks # Process chunks
chunks = await create_document_chunks(channel_content) chunks = await create_document_chunks(combined_document_string)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document # Create and store new document
document = Document( document = Document(
@ -318,8 +296,8 @@ async def index_slack_messages(
"message_count": len(formatted_messages), "message_count": len(formatted_messages),
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}, },
content=summary_content, content=combined_document_string,
embedding=summary_embedding, embedding=doc_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
) )