From d8f2c5f7cfd6eee8846a37a6ff0a93eeb8750c33 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Tue, 10 Jun 2025 13:56:23 -0700
Subject: [PATCH] fix: generate content hash based on search space id as well.

- Allows reindexing in separate search spaces.

---
 surfsense_backend/app/tasks/background_tasks.py    | 12 ++++++------
 .../app/tasks/connectors_indexing_tasks.py         | 10 +++++-----
 surfsense_backend/app/utils/document_converters.py |  7 ++++---
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py
index 77e06e7..5c3ffe3 100644
--- a/surfsense_backend/app/tasks/background_tasks.py
+++ b/surfsense_backend/app/tasks/background_tasks.py
@@ -72,7 +72,7 @@ async def add_crawled_url_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -179,7 +179,7 @@ async def add_extension_received_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -243,7 +243,7 @@ async def add_received_markdown_file_document(
     session: AsyncSession, file_name: str, file_in_markdown: str, search_space_id: int, user_id: str
 ) -> Optional[Document]:
     try:
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -314,7 +314,7 @@ async def add_received_file_document_using_unstructured(
             unstructured_processed_elements
         )
 
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -399,7 +399,7 @@ async def add_received_file_document_using_llamacloud(
         # Combine all markdown documents into one
        file_in_markdown = llamacloud_markdown_document
 
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -556,7 +556,7 @@ async def add_youtube_video_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
index 21243c3..eaa60a2 100644
--- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py
+++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
@@ -200,7 +200,7 @@ async def index_slack_messages(
                 document_parts.append("")
 
                 combined_document_string = '\n'.join(document_parts)
-                content_hash = generate_content_hash(combined_document_string)
+                content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                 # Check if document with this content hash already exists
                 existing_doc_by_hash_result = await session.execute(
@@ -474,7 +474,7 @@ async def index_notion_pages(
                 document_parts.append("")
 
                 combined_document_string = '\n'.join(document_parts)
-                content_hash = generate_content_hash(combined_document_string)
+                content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                 # Check if document with this content hash already exists
                 existing_doc_by_hash_result = await session.execute(
@@ -658,7 +658,7 @@ async def index_github_repos(
                     logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.")
                     continue # Skip if content fetch failed
 
                content_hash = generate_content_hash(file_content)
+                content_hash = generate_content_hash(file_content, search_space_id)
 
                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
@@ -897,7 +897,7 @@ async def index_linear_issues(
                comment_count = len(formatted_issue.get("comments", []))
                summary_content += f"Comments: {comment_count}"
 
-                content_hash = generate_content_hash(issue_content)
+                content_hash = generate_content_hash(issue_content, search_space_id)
 
                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
@@ -1151,7 +1151,7 @@ async def index_discord_messages(
                    document_parts.append(f"")
                    document_parts.append("")
                    combined_document_string = '\n'.join(document_parts)
-                    content_hash = generate_content_hash(combined_document_string)
+                    content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                    # Check if document with this content hash already exists
                    existing_doc_by_hash_result = await session.execute(
diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py
index a6f69e4..ab8ba4d 100644
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -141,6 +141,7 @@ def convert_chunks_to_langchain_documents(chunks):
     return langchain_docs
 
 
-def generate_content_hash(content: str) -> str:
-    """Generate SHA-256 hash for the given content."""
-    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+def generate_content_hash(content: str, search_space_id: int) -> str:
+    """Generate SHA-256 hash for the given content combined with search space ID."""
+    combined_data = f"{search_space_id}:{content}"
+    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()
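
For illustration (not part of the patch itself): a minimal standalone sketch of the new hashing behavior, using only the changed generate_content_hash above. The sample content and search space IDs are made up. Because the search space ID is mixed into the digest, identical content indexed into two different search spaces now produces two distinct hashes, so the existing-document check on content_hash only deduplicates within a single search space and no longer blocks re-indexing the same document elsewhere.

import hashlib


def generate_content_hash(content: str, search_space_id: int) -> str:
    """Generate SHA-256 hash for the given content combined with search space ID."""
    # Prefixing the search space ID namespaces the hash per search space.
    combined_data = f"{search_space_id}:{content}"
    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()


# Same content, different search spaces -> different hashes; same content and
# search space -> same hash (duplicate detection still works within a space).
doc = "Example crawled page content"  # illustrative content, not from the repo
assert generate_content_hash(doc, 1) != generate_content_hash(doc, 2)
assert generate_content_hash(doc, 1) == generate_content_hash(doc, 1)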