mirror of https://github.com/MODSetter/SurfSense.git
synced 2025-09-04 11:39:19 +00:00
fix: slack indexing
- Individual messages as Document instead of concatenating them.
This commit is contained in:
parent 0db3c32144
commit f443a6636f

2 changed files with 101 additions and 85 deletions
@@ -160,3 +160,41 @@ def build_document_metadata_string(
     document_parts.append("</DOCUMENT>")
 
     return "\n".join(document_parts)
+
+
+def build_document_metadata_markdown(
+    metadata_sections: list[tuple[str, list[str]]],
+) -> str:
+    """
+    Build a markdown document string from metadata sections.
+
+    Args:
+        metadata_sections: List of (section_title, section_content) tuples
+
+    Returns:
+        Combined markdown document string
+    """
+    document_parts = []
+
+    for section_title, section_content in metadata_sections:
+        # Convert section title to proper markdown header
+        document_parts.append(f"## {section_title.title()}")
+        document_parts.append("")  # Empty line after header
+
+        for content_line in section_content:
+            # Handle special content formatting
+            if content_line == "TEXT_START" or content_line == "TEXT_END":
+                continue  # Skip text delimiters in markdown
+            elif content_line.startswith("FORMAT: "):
+                # Skip format indicators in markdown
+                continue
+            else:
+                document_parts.append(content_line)
+
+        document_parts.append("")  # Empty line after section
+
+    # Remove trailing empty lines
+    while document_parts and document_parts[-1] == "":
+        document_parts.pop()
+
+    return "\n".join(document_parts)
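To show what the new helper emits, here is a brief usage sketch; the channel name, channel ID, and message text are made-up values for illustration:

sections = [
    ("METADATA", ["CHANNEL_NAME: general", "CHANNEL_ID: C012AB3CD"]),
    ("CONTENT", ["FORMAT: markdown", "TEXT_START", "Hello, team!", "TEXT_END"]),
]

# Section titles become "## Metadata" / "## Content" headers, while the
# FORMAT/TEXT_START/TEXT_END delimiters are dropped, yielding:
#
#   ## Metadata
#
#   CHANNEL_NAME: general
#   CHANNEL_ID: C012AB3CD
#
#   ## Content
#
#   Hello, team!
print(build_document_metadata_markdown(sections))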
@@ -8,18 +8,17 @@ from slack_sdk.errors import SlackApiError
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.config import config
 from app.connectors.slack_history import SlackHistory
 from app.db import Document, DocumentType, SearchSourceConnectorType
-from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
     create_document_chunks,
     generate_content_hash,
-    generate_document_summary,
 )
 
 from .base import (
-    build_document_metadata_string,
+    build_document_metadata_markdown,
     calculate_date_range,
     check_duplicate_document_by_hash,
     get_connector_by_id,
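The import swap tracks the pipeline change in the next hunk: each per-message document is chunked and embedded directly with the configured embedding model, so the long-context LLM lookup and summary generation drop out. A minimal sketch of the new call shape, assuming embed() takes a string and returns a vector as its use below implies; the wrapper function itself is hypothetical, not part of the repository:

from app.config import config
from app.utils.document_converters import create_document_chunks

async def embed_message_document(combined_document_string: str):
    # Chunk and embed the full document string directly; no LLM summary step.
    chunks = await create_document_chunks(combined_document_string)
    doc_embedding = config.embedding_model_instance.embed(combined_document_string)
    return chunks, doc_embedding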
@@ -234,98 +233,77 @@ async def index_slack_messages(
                     documents_skipped += 1
                     continue  # Skip if no valid messages after filtering
 
-                # Convert messages to markdown format
-                channel_content = f"# Slack Channel: {channel_name}\n\n"
-
                 for msg in formatted_messages:
-                    user_name = msg.get("user_name", "Unknown User")
                     timestamp = msg.get("datetime", "Unknown Time")
-                    text = msg.get("text", "")
+                    msg_user_name = msg.get("user_name", "Unknown User")
+                    msg_user_email = msg.get("user_email", "Unknown Email")
+                    msg_text = msg.get("text", "")
 
-                    channel_content += (
-                        f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
-                    )
-
-                # Format document metadata
-                metadata_sections = [
-                    (
-                        "METADATA",
-                        [
-                            f"CHANNEL_NAME: {channel_name}",
-                            f"CHANNEL_ID: {channel_id}",
-                            f"MESSAGE_COUNT: {len(formatted_messages)}",
-                        ],
-                    ),
-                    (
-                        "CONTENT",
-                        ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"],
-                    ),
-                ]
-
-                # Build the document string
-                combined_document_string = build_document_metadata_string(
-                    metadata_sections
-                )
-                content_hash = generate_content_hash(
-                    combined_document_string, search_space_id
-                )
-
-                # Check if document with this content hash already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
-                    session, content_hash
-                )
-
-                if existing_document_by_hash:
-                    logger.info(
-                        f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
-                    )
-                    documents_skipped += 1
-                    continue
-
-                # Get user's long context LLM
-                user_llm = await get_user_long_context_llm(session, user_id)
-                if not user_llm:
-                    logger.error(f"No long context LLM configured for user {user_id}")
-                    skipped_channels.append(f"{channel_name} (no LLM configured)")
-                    documents_skipped += 1
-                    continue
-
-                # Generate summary with metadata
-                document_metadata = {
-                    "channel_name": channel_name,
-                    "channel_id": channel_id,
-                    "message_count": len(formatted_messages),
-                    "document_type": "Slack Channel Messages",
-                    "connector_type": "Slack",
-                }
-                summary_content, summary_embedding = await generate_document_summary(
-                    combined_document_string, user_llm, document_metadata
-                )
-
-                # Process chunks
-                chunks = await create_document_chunks(channel_content)
-
-                # Create and store new document
-                document = Document(
-                    search_space_id=search_space_id,
-                    title=f"Slack - {channel_name}",
-                    document_type=DocumentType.SLACK_CONNECTOR,
-                    document_metadata={
-                        "channel_name": channel_name,
-                        "channel_id": channel_id,
-                        "start_date": start_date_str,
-                        "end_date": end_date_str,
-                        "message_count": len(formatted_messages),
-                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    },
-                    content=summary_content,
-                    embedding=summary_embedding,
-                    chunks=chunks,
-                    content_hash=content_hash,
-                )
-
-                session.add(document)
-                documents_indexed += 1
+                    # Format document metadata
+                    metadata_sections = [
+                        (
+                            "METADATA",
+                            [
+                                f"CHANNEL_NAME: {channel_name}",
+                                f"CHANNEL_ID: {channel_id}",
+                                f"MESSAGE_TIMESTAMP: {timestamp}",
+                                f"MESSAGE_USER_NAME: {msg_user_name}",
+                                f"MESSAGE_USER_EMAIL: {msg_user_email}",
+                            ],
+                        ),
+                        (
+                            "CONTENT",
+                            ["FORMAT: markdown", "TEXT_START", msg_text, "TEXT_END"],
+                        ),
+                    ]
+
+                    # Build the document string
+                    combined_document_string = build_document_metadata_markdown(
+                        metadata_sections
+                    )
+                    content_hash = generate_content_hash(
+                        combined_document_string, search_space_id
+                    )
+
+                    # Check if document with this content hash already exists
+                    existing_document_by_hash = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                    if existing_document_by_hash:
+                        logger.info(
+                            f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
+                        )
+                        documents_skipped += 1
+                        continue
+
+                    # Process chunks
+                    chunks = await create_document_chunks(combined_document_string)
+                    doc_embedding = config.embedding_model_instance.embed(
+                        combined_document_string
+                    )
+
+                    # Create and store new document
+                    document = Document(
+                        search_space_id=search_space_id,
+                        title=f"Slack - {channel_name}",
+                        document_type=DocumentType.SLACK_CONNECTOR,
+                        document_metadata={
+                            "channel_name": channel_name,
+                            "channel_id": channel_id,
+                            "start_date": start_date_str,
+                            "end_date": end_date_str,
+                            "message_count": len(formatted_messages),
+                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        },
+                        content=combined_document_string,
+                        embedding=doc_embedding,
+                        chunks=chunks,
+                        content_hash=content_hash,
+                    )
+
+                    session.add(document)
+                    documents_indexed += 1
                 logger.info(
                     f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
                 )
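For an end-to-end picture of the new per-message flow, here is a self-contained sketch. The message dict shape follows the fields read in the hunk above; the channel values are made up, and the sha256 helper is only a stand-in for generate_content_hash, whose real implementation is not shown in this diff:

import hashlib

# Stand-in for app.utils.document_converters.generate_content_hash
# (assumed behavior for illustration only).
def content_hash_sketch(content: str, search_space_id: int) -> str:
    return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()

# Field names match those read in the hunk above; values are invented.
msg = {
    "user_name": "alice",
    "user_email": "alice@example.com",
    "datetime": "2025-09-01 10:15:00",
    "text": "Deploy finished without errors.",
}

metadata_sections = [
    (
        "METADATA",
        [
            "CHANNEL_NAME: general",
            "CHANNEL_ID: C012AB3CD",
            f"MESSAGE_TIMESTAMP: {msg['datetime']}",
            f"MESSAGE_USER_NAME: {msg['user_name']}",
            f"MESSAGE_USER_EMAIL: {msg['user_email']}",
        ],
    ),
    ("CONTENT", ["FORMAT: markdown", "TEXT_START", msg["text"], "TEXT_END"]),
]

# build_document_metadata_markdown is the helper added in the first hunk.
combined = build_document_metadata_markdown(metadata_sections)
print(content_hash_sketch(combined, search_space_id=1))

Because every message is hashed independently, re-indexing a channel can now skip unchanged messages one by one instead of re-processing the whole channel whenever any message changes.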