mirror of https://github.com/MODSetter/SurfSense.git
synced 2025-09-04 11:39:19 +00:00
fix: slack indexing
- Individual messages as Document instead of concatenating them.
This commit is contained in:
parent 0db3c32144
commit f443a6636f

2 changed files with 101 additions and 85 deletions
@@ -160,3 +160,41 @@ def build_document_metadata_string(
     document_parts.append("</DOCUMENT>")
 
     return "\n".join(document_parts)
+
+
+def build_document_metadata_markdown(
+    metadata_sections: list[tuple[str, list[str]]],
+) -> str:
+    """
+    Build a markdown document string from metadata sections.
+
+    Args:
+        metadata_sections: List of (section_title, section_content) tuples
+
+    Returns:
+        Combined markdown document string
+    """
+    document_parts = []
+
+    for section_title, section_content in metadata_sections:
+        # Convert section title to proper markdown header
+        document_parts.append(f"## {section_title.title()}")
+        document_parts.append("")  # Empty line after header
+
+        for content_line in section_content:
+            # Handle special content formatting
+            if content_line == "TEXT_START" or content_line == "TEXT_END":
+                continue  # Skip text delimiters in markdown
+            elif content_line.startswith("FORMAT: "):
+                # Skip format indicators in markdown
+                continue
+            else:
+                document_parts.append(content_line)
+
+        document_parts.append("")  # Empty line after section
+
+    # Remove trailing empty lines
+    while document_parts and document_parts[-1] == "":
+        document_parts.pop()
+
+    return "\n".join(document_parts)
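To show what the new helper emits, here is a brief usage sketch; the channel name, channel ID, and message text are made-up values for illustration:

sections = [
    ("METADATA", ["CHANNEL_NAME: general", "CHANNEL_ID: C012AB3CD"]),
    ("CONTENT", ["FORMAT: markdown", "TEXT_START", "Hello, team!", "TEXT_END"]),
]

# Section titles become "## Metadata" / "## Content" headers, while the
# FORMAT/TEXT_START/TEXT_END delimiters are dropped, yielding:
#
#   ## Metadata
#
#   CHANNEL_NAME: general
#   CHANNEL_ID: C012AB3CD
#
#   ## Content
#
#   Hello, team!
print(build_document_metadata_markdown(sections))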
@@ -8,18 +8,17 @@ from slack_sdk.errors import SlackApiError
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.config import config
 from app.connectors.slack_history import SlackHistory
 from app.db import Document, DocumentType, SearchSourceConnectorType
-from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
     create_document_chunks,
     generate_content_hash,
-    generate_document_summary,
 )
 
 from .base import (
-    build_document_metadata_string,
+    build_document_metadata_markdown,
     calculate_date_range,
     check_duplicate_document_by_hash,
     get_connector_by_id,
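The import swap tracks the pipeline change in the next hunk: each per-message document is chunked and embedded directly with the configured embedding model, so the long-context LLM lookup and summary generation drop out. A minimal sketch of the new call shape, assuming embed() takes a string and returns a vector as its use below implies; the wrapper function itself is hypothetical, not part of the repository:

from app.config import config
from app.utils.document_converters import create_document_chunks

async def embed_message_document(combined_document_string: str):
    # Chunk and embed the full document string directly; no LLM summary step.
    chunks = await create_document_chunks(combined_document_string)
    doc_embedding = config.embedding_model_instance.embed(combined_document_string)
    return chunks, doc_embedding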
@@ -234,98 +233,77 @@ async def index_slack_messages(
                     documents_skipped += 1
                     continue  # Skip if no valid messages after filtering
 
-                # Convert messages to markdown format
-                channel_content = f"# Slack Channel: {channel_name}\n\n"
-
                 for msg in formatted_messages:
-                    user_name = msg.get("user_name", "Unknown User")
                     timestamp = msg.get("datetime", "Unknown Time")
-                    text = msg.get("text", "")
+                    msg_user_name = msg.get("user_name", "Unknown User")
+                    msg_user_email = msg.get("user_email", "Unknown Email")
+                    msg_text = msg.get("text", "")
 
-                    channel_content += (
-                        f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
-                    )
-
-                # Format document metadata
-                metadata_sections = [
-                    (
-                        "METADATA",
-                        [
-                            f"CHANNEL_NAME: {channel_name}",
-                            f"CHANNEL_ID: {channel_id}",
-                            f"MESSAGE_COUNT: {len(formatted_messages)}",
-                        ],
-                    ),
-                    (
-                        "CONTENT",
-                        ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"],
-                    ),
-                ]
-
-                # Build the document string
-                combined_document_string = build_document_metadata_string(
-                    metadata_sections
-                )
-                content_hash = generate_content_hash(
-                    combined_document_string, search_space_id
-                )
-
-                # Check if document with this content hash already exists
-                existing_document_by_hash = await check_duplicate_document_by_hash(
-                    session, content_hash
-                )
-
-                if existing_document_by_hash:
-                    logger.info(
-                        f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
-                    )
-                    documents_skipped += 1
-                    continue
-
-                # Get user's long context LLM
-                user_llm = await get_user_long_context_llm(session, user_id)
-                if not user_llm:
-                    logger.error(f"No long context LLM configured for user {user_id}")
-                    skipped_channels.append(f"{channel_name} (no LLM configured)")
-                    documents_skipped += 1
-                    continue
-
-                # Generate summary with metadata
-                document_metadata = {
-                    "channel_name": channel_name,
-                    "channel_id": channel_id,
-                    "message_count": len(formatted_messages),
-                    "document_type": "Slack Channel Messages",
-                    "connector_type": "Slack",
-                }
-                summary_content, summary_embedding = await generate_document_summary(
-                    combined_document_string, user_llm, document_metadata
-                )
-
-                # Process chunks
-                chunks = await create_document_chunks(channel_content)
-
-                # Create and store new document
-                document = Document(
-                    search_space_id=search_space_id,
-                    title=f"Slack - {channel_name}",
-                    document_type=DocumentType.SLACK_CONNECTOR,
-                    document_metadata={
-                        "channel_name": channel_name,
-                        "channel_id": channel_id,
-                        "start_date": start_date_str,
-                        "end_date": end_date_str,
-                        "message_count": len(formatted_messages),
-                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    },
-                    content=summary_content,
-                    embedding=summary_embedding,
-                    chunks=chunks,
-                    content_hash=content_hash,
-                )
-
-                session.add(document)
-                documents_indexed += 1
+                    # Format document metadata
+                    metadata_sections = [
+                        (
+                            "METADATA",
+                            [
+                                f"CHANNEL_NAME: {channel_name}",
+                                f"CHANNEL_ID: {channel_id}",
+                                f"MESSAGE_TIMESTAMP: {timestamp}",
+                                f"MESSAGE_USER_NAME: {msg_user_name}",
+                                f"MESSAGE_USER_EMAIL: {msg_user_email}",
+                            ],
+                        ),
+                        (
+                            "CONTENT",
+                            ["FORMAT: markdown", "TEXT_START", msg_text, "TEXT_END"],
+                        ),
+                    ]
+
+                    # Build the document string
+                    combined_document_string = build_document_metadata_markdown(
+                        metadata_sections
+                    )
+                    content_hash = generate_content_hash(
+                        combined_document_string, search_space_id
+                    )
+
+                    # Check if document with this content hash already exists
+                    existing_document_by_hash = await check_duplicate_document_by_hash(
+                        session, content_hash
+                    )
+
+                    if existing_document_by_hash:
+                        logger.info(
+                            f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
+                        )
+                        documents_skipped += 1
+                        continue
+
+                    # Process chunks
+                    chunks = await create_document_chunks(combined_document_string)
+                    doc_embedding = config.embedding_model_instance.embed(
+                        combined_document_string
+                    )
+
+                    # Create and store new document
+                    document = Document(
+                        search_space_id=search_space_id,
+                        title=f"Slack - {channel_name}",
+                        document_type=DocumentType.SLACK_CONNECTOR,
+                        document_metadata={
+                            "channel_name": channel_name,
+                            "channel_id": channel_id,
+                            "start_date": start_date_str,
+                            "end_date": end_date_str,
+                            "message_count": len(formatted_messages),
+                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        },
+                        content=combined_document_string,
+                        embedding=doc_embedding,
+                        chunks=chunks,
+                        content_hash=content_hash,
+                    )
+
+                    session.add(document)
+                    documents_indexed += 1
                 logger.info(
                     f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
                 )
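For an end-to-end picture of the new per-message flow, here is a self-contained sketch. The message dict shape follows the fields read in the hunk above; the channel values are made up, and the sha256 helper is only a stand-in for generate_content_hash, whose real implementation is not shown in this diff:

import hashlib

# Stand-in for app.utils.document_converters.generate_content_hash
# (assumed behavior for illustration only).
def content_hash_sketch(content: str, search_space_id: int) -> str:
    return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()

# Field names match those read in the hunk above; values are invented.
msg = {
    "user_name": "alice",
    "user_email": "alice@example.com",
    "datetime": "2025-09-01 10:15:00",
    "text": "Deploy finished without errors.",
}

metadata_sections = [
    (
        "METADATA",
        [
            "CHANNEL_NAME: general",
            "CHANNEL_ID: C012AB3CD",
            f"MESSAGE_TIMESTAMP: {msg['datetime']}",
            f"MESSAGE_USER_NAME: {msg['user_name']}",
            f"MESSAGE_USER_EMAIL: {msg['user_email']}",
        ],
    ),
    ("CONTENT", ["FORMAT: markdown", "TEXT_START", msg["text"], "TEXT_END"]),
]

# build_document_metadata_markdown is the helper added in the first hunk.
combined = build_document_metadata_markdown(metadata_sections)
print(content_hash_sketch(combined, search_space_id=1))

Because every message is hashed independently, re-indexing a channel can now skip unchanged messages one by one instead of re-processing the whole channel whenever any message changes.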