From d8f2c5f7cfd6eee8846a37a6ff0a93eeb8750c33 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Tue, 10 Jun 2025 13:56:23 -0700
Subject: [PATCH] fix: generate content hash based on search space id as well.

- Allows reindexing in separate search spaces.

---
 surfsense_backend/app/tasks/background_tasks.py    | 12 ++++++------
 .../app/tasks/connectors_indexing_tasks.py         | 10 +++++-----
 surfsense_backend/app/utils/document_converters.py |  7 ++++---
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py
index 77e06e7..5c3ffe3 100644
--- a/surfsense_backend/app/tasks/background_tasks.py
+++ b/surfsense_backend/app/tasks/background_tasks.py
@@ -72,7 +72,7 @@ async def add_crawled_url_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -179,7 +179,7 @@ async def add_extension_received_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -243,7 +243,7 @@ async def add_received_markdown_file_document(
     session: AsyncSession, file_name: str, file_in_markdown: str, search_space_id: int, user_id: str
 ) -> Optional[Document]:
     try:
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -314,7 +314,7 @@ async def add_received_file_document_using_unstructured(
             unstructured_processed_elements
         )
 
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -399,7 +399,7 @@ async def add_received_file_document_using_llamacloud(
         # Combine all markdown documents into one
        file_in_markdown = llamacloud_markdown_document
 
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -556,7 +556,7 @@ async def add_youtube_video_document(
         document_parts.append("")
 
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)
 
         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
index 21243c3..eaa60a2 100644
--- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py
+++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
@@ -200,7 +200,7 @@ async def index_slack_messages(
                 document_parts.append("")
 
                 combined_document_string = '\n'.join(document_parts)
-                content_hash = generate_content_hash(combined_document_string)
+                content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                 # Check if document with this content hash already exists
                 existing_doc_by_hash_result = await session.execute(
@@ -474,7 +474,7 @@ async def index_notion_pages(
                 document_parts.append("")
 
                 combined_document_string = '\n'.join(document_parts)
-                content_hash = generate_content_hash(combined_document_string)
+                content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                 # Check if document with this content hash already exists
                 existing_doc_by_hash_result = await session.execute(
@@ -658,7 +658,7 @@ async def index_github_repos(
                     logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.")
                     continue # Skip if content fetch failed
 
                content_hash = generate_content_hash(file_content)
+                content_hash = generate_content_hash(file_content, search_space_id)
 
                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
@@ -897,7 +897,7 @@ async def index_linear_issues(
                comment_count = len(formatted_issue.get("comments", []))
                summary_content += f"Comments: {comment_count}"
 
-                content_hash = generate_content_hash(issue_content)
+                content_hash = generate_content_hash(issue_content, search_space_id)
 
                # Check if document with this content hash already exists
                existing_doc_by_hash_result = await session.execute(
@@ -1151,7 +1151,7 @@ async def index_discord_messages(
                    document_parts.append(f"")
                    document_parts.append("")
                    combined_document_string = '\n'.join(document_parts)
-                    content_hash = generate_content_hash(combined_document_string)
+                    content_hash = generate_content_hash(combined_document_string, search_space_id)
 
                    # Check if document with this content hash already exists
                    existing_doc_by_hash_result = await session.execute(
diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py
index a6f69e4..ab8ba4d 100644
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -141,6 +141,7 @@ def convert_chunks_to_langchain_documents(chunks):
     return langchain_docs
 
 
-def generate_content_hash(content: str) -> str:
-    """Generate SHA-256 hash for the given content."""
-    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+def generate_content_hash(content: str, search_space_id: int) -> str:
+    """Generate SHA-256 hash for the given content combined with search space ID."""
+    combined_data = f"{search_space_id}:{content}"
+    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()
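
For illustration (not part of the patch itself): a minimal standalone sketch of the new hashing behavior, using only the changed generate_content_hash above. The sample content and search space IDs are made up. Because the search space ID is mixed into the digest, identical content indexed into two different search spaces now produces two distinct hashes, so the existing-document check on content_hash only deduplicates within a single search space and no longer blocks re-indexing the same document elsewhere.

import hashlib


def generate_content_hash(content: str, search_space_id: int) -> str:
    """Generate SHA-256 hash for the given content combined with search space ID."""
    # Prefixing the search space ID namespaces the hash per search space.
    combined_data = f"{search_space_id}:{content}"
    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()


# Same content, different search spaces -> different hashes; same content and
# search space -> same hash (duplicate detection still works within a space).
doc = "Example crawled page content"  # illustrative content, not from the repo
assert generate_content_hash(doc, 1) != generate_content_hash(doc, 2)
assert generate_content_hash(doc, 1) == generate_content_hash(doc, 1)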