diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index 28cd206..6d6f823 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -160,3 +160,41 @@ def build_document_metadata_string( document_parts.append("") return "\n".join(document_parts) + + +def build_document_metadata_markdown( + metadata_sections: list[tuple[str, list[str]]], +) -> str: + """ + Build a markdown document string from metadata sections. + + Args: + metadata_sections: List of (section_title, section_content) tuples + + Returns: + Combined markdown document string + """ + document_parts = [] + + for section_title, section_content in metadata_sections: + # Convert section title to proper markdown header + document_parts.append(f"## {section_title.title()}") + document_parts.append("") # Empty line after header + + for content_line in section_content: + # Handle special content formatting + if content_line == "TEXT_START" or content_line == "TEXT_END": + continue # Skip text delimiters in markdown + elif content_line.startswith("FORMAT: "): + # Skip format indicators in markdown + continue + else: + document_parts.append(content_line) + + document_parts.append("") # Empty line after section + + # Remove trailing empty lines + while document_parts and document_parts[-1] == "": + document_parts.pop() + + return "\n".join(document_parts) diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py index fcf3253..fb6dac9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -8,18 +8,17 @@ from slack_sdk.errors import SlackApiError from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession +from app.config import config from app.connectors.slack_history import SlackHistory from app.db import Document, DocumentType, SearchSourceConnectorType -from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, generate_content_hash, - generate_document_summary, ) from .base import ( - build_document_metadata_string, + build_document_metadata_markdown, calculate_date_range, check_duplicate_document_by_hash, get_connector_by_id, @@ -234,98 +233,77 @@ async def index_slack_messages( documents_skipped += 1 continue # Skip if no valid messages after filtering - # Convert messages to markdown format - channel_content = f"# Slack Channel: {channel_name}\n\n" - for msg in formatted_messages: - user_name = msg.get("user_name", "Unknown User") timestamp = msg.get("datetime", "Unknown Time") - text = msg.get("text", "") + msg_user_name = msg.get("user_name", "Unknown User") + msg_user_email = msg.get("user_email", "Unknown Email") + msg_text = msg.get("text", "") - channel_content += ( - f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + # Format document metadata + metadata_sections = [ + ( + "METADATA", + [ + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_TIMESTAMP: {timestamp}", + f"MESSAGE_USER_NAME: {msg_user_name}", + f"MESSAGE_USER_EMAIL: {msg_user_email}", + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", msg_text, "TEXT_END"], + ), + ] + + # Build the document string + combined_document_string = build_document_metadata_markdown( + metadata_sections + ) + content_hash = generate_content_hash( + combined_document_string, search_space_id ) - # Format document metadata - metadata_sections = [ - ( - "METADATA", - [ - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - f"MESSAGE_COUNT: {len(formatted_messages)}", - ], - ), - ( - "CONTENT", - ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], - ), - ] - - # Build the document string - combined_document_string = build_document_metadata_string( - metadata_sections - ) - content_hash = generate_content_hash( - combined_document_string, search_space_id - ) - - # Check if document with this content hash already exists - existing_document_by_hash = await check_duplicate_document_by_hash( - session, content_hash - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." + # Check if document with this content hash already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash ) - documents_skipped += 1 - continue - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_channels.append(f"{channel_name} (no LLM configured)") - documents_skipped += 1 - continue + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." + ) + documents_skipped += 1 + continue - # Generate summary with metadata - document_metadata = { - "channel_name": channel_name, - "channel_id": channel_id, - "message_count": len(formatted_messages), - "document_type": "Slack Channel Messages", - "connector_type": "Slack", - } - summary_content, summary_embedding = await generate_document_summary( - combined_document_string, user_llm, document_metadata - ) + # Process chunks + chunks = await create_document_chunks(combined_document_string) + doc_embedding = config.embedding_model_instance.embed( + combined_document_string + ) - # Process chunks - chunks = await create_document_chunks(channel_content) + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Slack - {channel_name}", + document_type=DocumentType.SLACK_CONNECTOR, + document_metadata={ + "channel_name": channel_name, + "channel_id": channel_id, + "start_date": start_date_str, + "end_date": end_date_str, + "message_count": len(formatted_messages), + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=combined_document_string, + embedding=doc_embedding, + chunks=chunks, + content_hash=content_hash, + ) - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Slack - {channel_name}", - document_type=DocumentType.SLACK_CONNECTOR, - document_metadata={ - "channel_name": channel_name, - "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - documents_indexed += 1 + session.add(document) + documents_indexed += 1 logger.info( f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" )