feat: Fix document summary content across connectors and processors

author DESKTOP-RTLN3BA\$punk 2025-08-18 20:51:48 -07:00
parent c6921a4083
commit 1c4c61eb04
19 changed files with 474 additions and 233 deletions

View file

@@ -513,7 +513,7 @@ async def process_file_in_background(
 @router.get("/documents/", response_model=list[DocumentRead])
 async def read_documents(
     skip: int = 0,
-    limit: int = 300,
+    limit: int = 3000,
     search_space_id: int | None = None,
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),

View file

@@ -8,9 +8,7 @@ from datetime import datetime, timedelta
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select

-from app.config import config
 from app.db import (
-    Chunk,
     Document,
     SearchSourceConnector,
     SearchSourceConnectorType,
@@ -39,25 +37,6 @@ async def check_duplicate_document_by_hash(
     return existing_doc_result.scalars().first()


-async def create_document_chunks(content: str) -> list[Chunk]:
-    """
-    Create chunks from document content.
-
-    Args:
-        content: Document content to chunk
-
-    Returns:
-        List of Chunk objects with embeddings
-    """
-    return [
-        Chunk(
-            content=chunk.text,
-            embedding=config.embedding_model_instance.embed(chunk.text),
-        )
-        for chunk in config.chunker_instance.chunk(content)
-    ]
-
-
 async def get_connector_by_id(
     session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
 ) -> SearchSourceConnector | None:

View file

@@ -10,12 +10,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.clickup_connector import ClickUpConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -217,10 +221,34 @@ async def index_clickup_tasks(
                 documents_skipped += 1
                 continue

-            # Embedding and chunks
-            summary_embedding = config.embedding_model_instance.embed(
-                task_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+
+            if user_llm:
+                document_metadata = {
+                    "task_id": task_id,
+                    "task_name": task_name,
+                    "task_status": task_status,
+                    "task_priority": task_priority,
+                    "task_list": task_list_name,
+                    "task_space": task_space_name,
+                    "assignees": len(task_assignees),
+                    "document_type": "ClickUp Task",
+                    "connector_type": "ClickUp",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    task_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = task_content
+                summary_embedding = config.embedding_model_instance.embed(
+                    task_content
+                )
+
             chunks = await create_document_chunks(task_content)

             document = Document(
@@ -238,7 +266,7 @@ async def index_clickup_tasks(
                     "task_updated": task_updated,
                     "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 },
-                content=task_content,
+                content=summary_content,
                 content_hash=content_hash,
                 embedding=summary_embedding,
                 chunks=chunks,

View file

@@ -10,13 +10,17 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.confluence_connector import ConfluenceConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     calculate_date_range,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -213,21 +217,6 @@ async def index_confluence_pages(
                 documents_skipped += 1
                 continue

-            # Create a simple summary
-            summary_content = (
-                f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
-            )
-            if page_content:
-                # Take first 500 characters of content for summary
-                content_preview = page_content[:500]
-                if len(page_content) > 500:
-                    content_preview += "..."
-                summary_content += f"Content Preview: {content_preview}\n\n"
-
-            # Add comment count
-            comment_count = len(comments)
-            summary_content += f"Comments: {comment_count}"
-
             # Generate content hash
             content_hash = generate_content_hash(full_content, search_space_id)
@@ -243,10 +232,40 @@ async def index_confluence_pages(
                 documents_skipped += 1
                 continue

-            # Generate embedding for the summary
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+            comment_count = len(comments)
+
+            if user_llm:
+                document_metadata = {
+                    "page_title": page_title,
+                    "page_id": page_id,
+                    "space_id": space_id,
+                    "comment_count": comment_count,
+                    "document_type": "Confluence Page",
+                    "connector_type": "Confluence",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    full_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = (
+                    f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
+                )
+                if page_content:
+                    # Take first 500 characters of content for summary
+                    content_preview = page_content[:500]
+                    if len(page_content) > 500:
+                        content_preview += "..."
+                    summary_content += f"Content Preview: {content_preview}\n\n"
+                summary_content += f"Comments: {comment_count}"
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             # Process chunks - using the full page content with comments
             chunks = await create_document_chunks(full_content)

View file

@@ -8,18 +8,19 @@ from datetime import UTC, datetime, timedelta
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config
 from app.connectors.discord_connector import DiscordConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
-from app.prompts import SUMMARY_PROMPT_TEMPLATE
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     build_document_metadata_string,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -335,14 +336,19 @@ async def index_discord_messages(
                 documents_skipped += 1
                 continue

-            # Generate summary using summary_chain
-            summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
-            summary_result = await summary_chain.ainvoke(
-                {"document": combined_document_string}
-            )
-            summary_content = summary_result.content
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
+            # Generate summary with metadata
+            document_metadata = {
+                "guild_name": guild_name,
+                "channel_name": channel_name,
+                "message_count": len(formatted_messages),
+                "document_type": "Discord Channel Messages",
+                "connector_type": "Discord",
+            }
+            (
+                summary_content,
+                summary_embedding,
+            ) = await generate_document_summary(
+                combined_document_string, user_llm, document_metadata
             )

             # Chunks from channel content

View file

@@ -10,12 +10,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.github_connector import GitHubConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
 )
@@ -208,12 +212,34 @@ async def index_github_repos(
                 )
                 continue

-            # Use file_content directly for chunking, maybe summary for main content?
-            # For now, let's use the full content for both, might need refinement
-            summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple summary
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+
+            if user_llm:
+                # Extract file extension from file path
+                file_extension = (
+                    file_path.split(".")[-1] if "." in file_path else None
+                )
+                document_metadata = {
+                    "file_path": full_path_key,
+                    "repository": repo_full_name,
+                    "file_type": file_extension or "unknown",
+                    "document_type": "GitHub Repository File",
+                    "connector_type": "GitHub",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    file_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = (
+                    f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
+                )
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             # Chunk the content
             try:

View file

@@ -11,11 +11,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.google_calendar_connector import GoogleCalendarConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -237,18 +241,6 @@ async def index_google_calendar_events(
             location = event.get("location", "")
             description = event.get("description", "")

-            summary_content = f"Google Calendar Event: {event_summary}\n\n"
-            summary_content += f"Calendar: {calendar_id}\n"
-            summary_content += f"Start: {start_time}\n"
-            summary_content += f"End: {end_time}\n"
-            if location:
-                summary_content += f"Location: {location}\n"
-            if description:
-                desc_preview = description[:300]
-                if len(description) > 300:
-                    desc_preview += "..."
-                summary_content += f"Description: {desc_preview}\n"
-
             content_hash = generate_content_hash(event_markdown, search_space_id)

             # Duplicate check via simple query using helper in base
@@ -266,10 +258,42 @@ async def index_google_calendar_events(
                 documents_skipped += 1
                 continue

-            # Embeddings and chunks
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+
+            if user_llm:
+                document_metadata = {
+                    "event_id": event_id,
+                    "event_summary": event_summary,
+                    "calendar_id": calendar_id,
+                    "start_time": start_time,
+                    "end_time": end_time,
+                    "location": location or "No location",
+                    "document_type": "Google Calendar Event",
+                    "connector_type": "Google Calendar",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    event_markdown, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = f"Google Calendar Event: {event_summary}\n\n"
+                summary_content += f"Calendar: {calendar_id}\n"
+                summary_content += f"Start: {start_time}\n"
+                summary_content += f"End: {end_time}\n"
+                if location:
+                    summary_content += f"Location: {location}\n"
+                if description:
+                    desc_preview = description[:300]
+                    if len(description) > 300:
+                        desc_preview += "..."
+                    summary_content += f"Description: {desc_preview}\n"
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             chunks = await create_document_chunks(event_markdown)

             document = Document(

View file

@@ -15,12 +15,16 @@ from app.db import (
     DocumentType,
     SearchSourceConnectorType,
 )
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -186,11 +190,6 @@ async def index_google_gmail_messages(
                 documents_skipped += 1
                 continue

-            # Create a simple summary
-            summary_content = f"Google Gmail Message: {subject}\n\n"
-            summary_content += f"Sender: {sender}\n"
-            summary_content += f"Date: {date_str}\n"
-
             # Generate content hash
             content_hash = generate_content_hash(markdown_content, search_space_id)
@@ -206,10 +205,33 @@ async def index_google_gmail_messages(
                 documents_skipped += 1
                 continue

-            # Generate embedding for the summary
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+
+            if user_llm:
+                document_metadata = {
+                    "message_id": message_id,
+                    "thread_id": thread_id,
+                    "subject": subject,
+                    "sender": sender,
+                    "date": date_str,
+                    "document_type": "Gmail Message",
+                    "connector_type": "Google Gmail",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    markdown_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = f"Google Gmail Message: {subject}\n\n"
+                summary_content += f"Sender: {sender}\n"
+                summary_content += f"Date: {date_str}\n"
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             # Process chunks
             chunks = await create_document_chunks(markdown_content)
@@ -228,7 +250,7 @@ async def index_google_gmail_messages(
                     "date": date_str,
                     "connector_id": connector_id,
                 },
-                content=markdown_content,
+                content=summary_content,
                 content_hash=content_hash,
                 embedding=summary_embedding,
                 chunks=chunks,

View file

@@ -10,13 +10,17 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.jira_connector import JiraConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     calculate_date_range,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -196,17 +200,6 @@ async def index_jira_issues(
                 documents_skipped += 1
                 continue

-            # Create a simple summary
-            summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
-            if formatted_issue.get("description"):
-                summary_content += (
-                    f"Description: {formatted_issue.get('description')}\n\n"
-                )
-
-            # Add comment count
-            comment_count = len(formatted_issue.get("comments", []))
-            summary_content += f"Comments: {comment_count}"
-
             # Generate content hash
             content_hash = generate_content_hash(issue_content, search_space_id)
@@ -222,10 +215,37 @@ async def index_jira_issues(
                 documents_skipped += 1
                 continue

-            # Generate embedding for the summary
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+            comment_count = len(formatted_issue.get("comments", []))
+
+            if user_llm:
+                document_metadata = {
+                    "issue_key": issue_identifier,
+                    "issue_title": issue_title,
+                    "status": formatted_issue.get("status", "Unknown"),
+                    "priority": formatted_issue.get("priority", "Unknown"),
+                    "comment_count": comment_count,
+                    "document_type": "Jira Issue",
+                    "connector_type": "Jira",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    issue_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
+                if formatted_issue.get("description"):
+                    summary_content += (
+                        f"Description: {formatted_issue.get('description')}\n\n"
+                    )
+                summary_content += f"Comments: {comment_count}"
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             # Process chunks - using the full issue content with comments
             chunks = await create_document_chunks(issue_content)

View file

@@ -10,13 +10,17 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
 from app.connectors.linear_connector import LinearConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     calculate_date_range,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -209,22 +213,6 @@ async def index_linear_issues(
                 documents_skipped += 1
                 continue

-            # Create a short summary for the embedding
-            state = formatted_issue.get("state", "Unknown")
-            description = formatted_issue.get("description", "")
-
-            # Truncate description if it's too long for the summary
-            if description and len(description) > 500:
-                description = description[:497] + "..."
-
-            # Create a simple summary from the issue data
-            summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
-            if description:
-                summary_content += f"Description: {description}\n\n"
-
-            # Add comment count
-            comment_count = len(formatted_issue.get("comments", []))
-            summary_content += f"Comments: {comment_count}"
-
             content_hash = generate_content_hash(issue_content, search_space_id)

             # Check if document with this content hash already exists
@@ -239,10 +227,40 @@ async def index_linear_issues(
                 documents_skipped += 1
                 continue

-            # Generate embedding for the summary
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
-            )
+            # Generate summary with metadata
+            user_llm = await get_user_long_context_llm(session, user_id)
+            state = formatted_issue.get("state", "Unknown")
+            description = formatted_issue.get("description", "")
+            comment_count = len(formatted_issue.get("comments", []))
+
+            if user_llm:
+                document_metadata = {
+                    "issue_id": issue_identifier,
+                    "issue_title": issue_title,
+                    "state": state,
+                    "priority": formatted_issue.get("priority", "Unknown"),
+                    "comment_count": comment_count,
+                    "document_type": "Linear Issue",
+                    "connector_type": "Linear",
+                }
+                (
+                    summary_content,
+                    summary_embedding,
+                ) = await generate_document_summary(
+                    issue_content, user_llm, document_metadata
+                )
+            else:
+                # Fallback to simple summary if no LLM configured
+                # Truncate description if it's too long for the summary
+                if description and len(description) > 500:
+                    description = description[:497] + "..."
+                summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
+                if description:
+                    summary_content += f"Description: {description}\n\n"
+                summary_content += f"Comments: {comment_count}"
+                summary_embedding = config.embedding_model_instance.embed(
+                    summary_content
+                )

             # Process chunks - using the full issue content with comments
             chunks = await create_document_chunks(issue_content)

View file

@@ -7,18 +7,19 @@ from datetime import datetime, timedelta
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config
 from app.connectors.notion_history import NotionHistoryConnector
 from app.db import Document, DocumentType, SearchSourceConnectorType
-from app.prompts import SUMMARY_PROMPT_TEMPLATE
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     build_document_metadata_string,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -302,15 +303,16 @@ async def index_notion_pages(
                 documents_skipped += 1
                 continue

-            # Generate summary
+            # Generate summary with metadata
             logger.debug(f"Generating summary for page {page_title}")
-            summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
-            summary_result = await summary_chain.ainvoke(
-                {"document": combined_document_string}
-            )
-            summary_content = summary_result.content
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
+            document_metadata = {
+                "page_title": page_title,
+                "page_id": page_id,
+                "document_type": "Notion Page",
+                "connector_type": "Notion",
+            }
+            summary_content, summary_embedding = await generate_document_summary(
+                markdown_content, user_llm, document_metadata
             )

             # Process chunks

View file

@@ -8,19 +8,20 @@ from slack_sdk.errors import SlackApiError
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config
 from app.connectors.slack_history import SlackHistory
 from app.db import Document, DocumentType, SearchSourceConnectorType
-from app.prompts import SUMMARY_PROMPT_TEMPLATE
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     build_document_metadata_string,
     calculate_date_range,
     check_duplicate_document_by_hash,
-    create_document_chunks,
     get_connector_by_id,
     logger,
     update_connector_last_indexed,
@@ -289,14 +290,16 @@ async def index_slack_messages(
                 documents_skipped += 1
                 continue

-            # Generate summary
-            summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
-            summary_result = await summary_chain.ainvoke(
-                {"document": combined_document_string}
-            )
-            summary_content = summary_result.content
-            summary_embedding = config.embedding_model_instance.embed(
-                summary_content
+            # Generate summary with metadata
+            document_metadata = {
+                "channel_name": channel_name,
+                "channel_id": channel_id,
+                "message_count": len(formatted_messages),
+                "document_type": "Slack Channel Messages",
+                "connector_type": "Slack",
+            }
+            summary_content, summary_embedding = await generate_document_summary(
+                combined_document_string, user_llm, document_metadata
             )

             # Process chunks

View file

@@ -6,9 +6,7 @@ from langchain_community.document_transformers import MarkdownifyTransformer
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select

-from app.config import config
-from app.db import Chunk, Document
-from app.prompts import SUMMARY_PROMPT_TEMPLATE
+from app.db import Document

 # Initialize markdown transformer
 md = MarkdownifyTransformer()
@@ -31,44 +29,3 @@ async def check_duplicate_document(
         select(Document).where(Document.content_hash == content_hash)
     )
     return existing_doc_result.scalars().first()
-
-
-async def create_document_chunks(content: str) -> list[Chunk]:
-    """
-    Create chunks from document content.
-
-    Args:
-        content: Document content to chunk
-
-    Returns:
-        List of Chunk objects with embeddings
-    """
-    return [
-        Chunk(
-            content=chunk.text,
-            embedding=config.embedding_model_instance.embed(chunk.text),
-        )
-        for chunk in config.chunker_instance.chunk(content)
-    ]
-
-
-async def generate_document_summary(
-    content: str, user_llm, document_title: str = ""
-) -> tuple[str, list[float]]:
-    """
-    Generate summary and embedding for document content.
-
-    Args:
-        content: Document content
-        user_llm: User's LLM instance
-        document_title: Optional document title for context
-
-    Returns:
-        Tuple of (summary_content, summary_embedding)
-    """
-    summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
-    summary_result = await summary_chain.ainvoke({"document": content})
-    summary_content = summary_result.content
-    summary_embedding = config.embedding_model_instance.embed(summary_content)
-
-    return summary_content, summary_embedding

View file

@@ -11,12 +11,14 @@ from app.db import Document, DocumentType
 from app.schemas import ExtensionDocumentContent
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )
@@ -106,9 +108,18 @@ async def add_extension_received_document(
         if not user_llm:
             raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "session_id": content.metadata.BrowsingSessionId,
+            "url": content.metadata.VisitedWebPageURL,
+            "title": content.metadata.VisitedWebPageTitle,
+            "referrer": content.metadata.VisitedWebPageReffererURL,
+            "timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
+            "duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
+            "document_type": "Browser Extension Capture",
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
         )

         # Process chunks

View file

@@ -12,13 +12,13 @@ from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     convert_document_to_markdown,
+    create_document_chunks,
     generate_content_hash,
+    generate_document_summary,
 )

 from .base import (
     check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )
@@ -64,9 +64,14 @@ async def add_received_file_document_using_unstructured(
         if not user_llm:
             raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "UNSTRUCTURED",
+            "document_type": "File Document",
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
         )

         # Process chunks
@@ -139,9 +144,14 @@ async def add_received_file_document_using_llamacloud(
         if not user_llm:
             raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "LLAMACLOUD",
+            "document_type": "File Document",
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
         )

         # Process chunks
@@ -224,9 +234,30 @@ async def add_received_file_document_using_docling(
             content=file_in_markdown, llm=user_llm, document_title=file_name
         )

+        # Enhance summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "etl_service": "DOCLING",
+            "document_type": "File Document",
+        }
+
+        metadata_parts = []
+        metadata_parts.append("# DOCUMENT METADATA")
+        for key, value in document_metadata.items():
+            if value:  # Only include non-empty values
+                formatted_key = key.replace("_", " ").title()
+                metadata_parts.append(f"**{formatted_key}:** {value}")
+
+        metadata_section = "\n".join(metadata_parts)
+        enhanced_summary_content = (
+            f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
+        )
+
         from app.config import config

-        summary_embedding = config.embedding_model_instance.embed(summary_content)
+        summary_embedding = config.embedding_model_instance.embed(
+            enhanced_summary_content
+        )

         # Process chunks
         chunks = await create_document_chunks(file_in_markdown)
@@ -240,7 +271,7 @@ async def add_received_file_document_using_docling(
                 "FILE_NAME": file_name,
                 "ETL_SERVICE": "DOCLING",
             },
-            content=summary_content,
+            content=enhanced_summary_content,
             embedding=summary_embedding,
             chunks=chunks,
             content_hash=content_hash,

View file

@@ -10,12 +10,14 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )
@@ -77,9 +79,13 @@ async def add_received_markdown_file_document(
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

-        # Generate summary
+        # Generate summary with metadata
+        document_metadata = {
+            "file_name": file_name,
+            "document_type": "Markdown File Document",
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm
+            file_in_markdown, user_llm, document_metadata
         )

         # Process chunks

View file

@@ -13,12 +13,14 @@ from app.config import config
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
     md,
 )
@@ -170,8 +172,15 @@ async def add_crawled_url_document(
             {"stage": "summary_generation"},
         )

+        # Generate summary with metadata
+        document_metadata = {
+            "url": url,
+            "title": url_crawled[0].metadata.get("title", url),
+            "document_type": "Crawled URL Document",
+            "crawler_type": type(crawl_loader).__name__,
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
         )

         # Process chunks

View file

@@ -13,12 +13,14 @@ from youtube_transcript_api import YouTubeTranscriptApi
 from app.db import Document, DocumentType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import generate_content_hash
+from app.utils.document_converters import (
+    create_document_chunks,
+    generate_content_hash,
+    generate_document_summary,
+)

 from .base import (
     check_duplicate_document,
-    create_document_chunks,
-    generate_document_summary,
 )
@@ -242,8 +244,18 @@ async def add_youtube_video_document(
             {"stage": "summary_generation"},
         )

+        # Generate summary with metadata
+        document_metadata = {
+            "url": url,
+            "video_id": video_id,
+            "title": video_data.get("title", "YouTube Video"),
+            "author": video_data.get("author_name", "Unknown"),
+            "thumbnail": video_data.get("thumbnail_url", ""),
+            "document_type": "YouTube Video Document",
+            "has_transcript": "No captions available" not in transcript_text,
+        }
         summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm
+            combined_document_string, user_llm, document_metadata
        )

         # Process chunks

View file

@@ -1,5 +1,73 @@
 import hashlib

+from app.config import config
+from app.db import Chunk
+from app.prompts import SUMMARY_PROMPT_TEMPLATE
+
+
+async def generate_document_summary(
+    content: str,
+    user_llm,
+    document_metadata: dict | None = None,
+    document_title: str = "",
+) -> tuple[str, list[float]]:
+    """
+    Generate summary and embedding for document content with metadata.
+
+    Args:
+        content: Document content
+        user_llm: User's LLM instance
+        document_metadata: Optional metadata dictionary to include in summary
+        document_title: Optional document title for context (deprecated, use metadata)
+
+    Returns:
+        Tuple of (enhanced_summary_content, summary_embedding)
+    """
+    summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
+    content_with_metadata = f"<DOCUMENT><DOCUMENT_METADATA>\n\n{document_metadata}\n\n</DOCUMENT_METADATA>\n\n<DOCUMENT_CONTENT>\n\n{content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
+    summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
+    summary_content = summary_result.content
+
+    # Combine summary with metadata if provided
+    if document_metadata:
+        metadata_parts = []
+        metadata_parts.append("# DOCUMENT METADATA")
+        for key, value in document_metadata.items():
+            if value:  # Only include non-empty values
+                formatted_key = key.replace("_", " ").title()
+                metadata_parts.append(f"**{formatted_key}:** {value}")
+
+        metadata_section = "\n".join(metadata_parts)
+        enhanced_summary_content = (
+            f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
+        )
+    else:
+        enhanced_summary_content = summary_content
+
+    summary_embedding = config.embedding_model_instance.embed(enhanced_summary_content)
+
+    return enhanced_summary_content, summary_embedding
+
+
+async def create_document_chunks(content: str) -> list[Chunk]:
+    """
+    Create chunks from document content.
+
+    Args:
+        content: Document content to chunk
+
+    Returns:
+        List of Chunk objects with embeddings
+    """
+    return [
+        Chunk(
+            content=chunk.text,
+            embedding=config.embedding_model_instance.embed(chunk.text),
+        )
+        for chunk in config.chunker_instance.chunk(content)
+    ]
+
+
 async def convert_element_to_markdown(element) -> str:
     """