fix: slack indexing

- Index individual messages as Documents instead of concatenating them.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-08-21 14:23:52 -07:00
parent 0db3c32144
commit f443a6636f
2 changed files with 101 additions and 85 deletions

View file

@ -160,3 +160,41 @@ def build_document_metadata_string(
document_parts.append("</DOCUMENT>") document_parts.append("</DOCUMENT>")
return "\n".join(document_parts) return "\n".join(document_parts)
def build_document_metadata_markdown(
    metadata_sections: list[tuple[str, list[str]]],
) -> str:
    """
    Render metadata sections as a single markdown document.

    Args:
        metadata_sections: List of (section_title, section_content) tuples,
            where section_content is a list of lines. The sentinel lines
            ``TEXT_START``/``TEXT_END`` and any line beginning with
            ``"FORMAT: "`` are dropped, since markdown needs neither text
            delimiters nor format indicators.

    Returns:
        The combined markdown string: each section becomes an ``##`` header
        (title-cased) followed by its surviving content lines, with blank
        lines separating sections and no trailing blank lines.
    """
    delimiters = ("TEXT_START", "TEXT_END")
    lines: list[str] = []

    for title, content in metadata_sections:
        # Section header in title case, followed by a spacer line.
        lines.append(f"## {title.title()}")
        lines.append("")
        # Keep only real content: drop delimiter sentinels and format tags.
        lines.extend(
            entry
            for entry in content
            if entry not in delimiters and not entry.startswith("FORMAT: ")
        )
        # Blank separator after each section.
        lines.append("")

    # Strip any run of empty lines left at the end of the document.
    while lines and lines[-1] == "":
        lines.pop()

    return "\n".join(lines)

View file

@ -8,18 +8,17 @@ from slack_sdk.errors import SlackApiError
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.slack_history import SlackHistory from app.connectors.slack_history import SlackHistory
from app.db import Document, DocumentType, SearchSourceConnectorType from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import ( from app.utils.document_converters import (
create_document_chunks, create_document_chunks,
generate_content_hash, generate_content_hash,
generate_document_summary,
) )
from .base import ( from .base import (
build_document_metadata_string, build_document_metadata_markdown,
calculate_date_range, calculate_date_range,
check_duplicate_document_by_hash, check_duplicate_document_by_hash,
get_connector_by_id, get_connector_by_id,
@ -234,17 +233,11 @@ async def index_slack_messages(
documents_skipped += 1 documents_skipped += 1
continue # Skip if no valid messages after filtering continue # Skip if no valid messages after filtering
# Convert messages to markdown format
channel_content = f"# Slack Channel: {channel_name}\n\n"
for msg in formatted_messages: for msg in formatted_messages:
user_name = msg.get("user_name", "Unknown User")
timestamp = msg.get("datetime", "Unknown Time") timestamp = msg.get("datetime", "Unknown Time")
text = msg.get("text", "") msg_user_name = msg.get("user_name", "Unknown User")
msg_user_email = msg.get("user_email", "Unknown Email")
channel_content += ( msg_text = msg.get("text", "")
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
)
# Format document metadata # Format document metadata
metadata_sections = [ metadata_sections = [
@ -253,17 +246,19 @@ async def index_slack_messages(
[ [
f"CHANNEL_NAME: {channel_name}", f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}", f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}", f"MESSAGE_TIMESTAMP: {timestamp}",
f"MESSAGE_USER_NAME: {msg_user_name}",
f"MESSAGE_USER_EMAIL: {msg_user_email}",
], ],
), ),
( (
"CONTENT", "CONTENT",
["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], ["FORMAT: markdown", "TEXT_START", msg_text, "TEXT_END"],
), ),
] ]
# Build the document string # Build the document string
combined_document_string = build_document_metadata_string( combined_document_string = build_document_metadata_markdown(
metadata_sections metadata_sections
) )
content_hash = generate_content_hash( content_hash = generate_content_hash(
@ -282,28 +277,11 @@ async def index_slack_messages(
documents_skipped += 1 documents_skipped += 1
continue continue
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
logger.error(f"No long context LLM configured for user {user_id}")
skipped_channels.append(f"{channel_name} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"channel_name": channel_name,
"channel_id": channel_id,
"message_count": len(formatted_messages),
"document_type": "Slack Channel Messages",
"connector_type": "Slack",
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Process chunks # Process chunks
chunks = await create_document_chunks(channel_content) chunks = await create_document_chunks(combined_document_string)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document # Create and store new document
document = Document( document = Document(
@ -318,8 +296,8 @@ async def index_slack_messages(
"message_count": len(formatted_messages), "message_count": len(formatted_messages),
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}, },
content=summary_content, content=combined_document_string,
embedding=summary_embedding, embedding=doc_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
) )