Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-01 18:19:08 +00:00)
fix: generate content hash based on search space id as well.
- Allows reindexing in separate search spaces.
This commit is contained in:
parent
fa54de1f41
commit
d8f2c5f7cf
3 changed files with 15 additions and 14 deletions
@@ -72,7 +72,7 @@ async def add_crawled_url_document(

         document_parts.append("</DOCUMENT>")
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -179,7 +179,7 @@ async def add_extension_received_document(

         document_parts.append("</DOCUMENT>")
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -243,7 +243,7 @@ async def add_received_markdown_file_document(
     session: AsyncSession, file_name: str, file_in_markdown: str, search_space_id: int, user_id: str
 ) -> Optional[Document]:
     try:
-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -314,7 +314,7 @@ async def add_received_file_document_using_unstructured(
             unstructured_processed_elements
         )

-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -399,7 +399,7 @@ async def add_received_file_document_using_llamacloud(
         # Combine all markdown documents into one
         file_in_markdown = llamacloud_markdown_document

-        content_hash = generate_content_hash(file_in_markdown)
+        content_hash = generate_content_hash(file_in_markdown, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -556,7 +556,7 @@ async def add_youtube_video_document(

         document_parts.append("</DOCUMENT>")
         combined_document_string = "\n".join(document_parts)
-        content_hash = generate_content_hash(combined_document_string)
+        content_hash = generate_content_hash(combined_document_string, search_space_id)

         # Check if document with this content hash already exists
         existing_doc_result = await session.execute(
@@ -200,7 +200,7 @@ async def index_slack_messages(

             document_parts.append("</DOCUMENT>")
             combined_document_string = '\n'.join(document_parts)
-            content_hash = generate_content_hash(combined_document_string)
+            content_hash = generate_content_hash(combined_document_string, search_space_id)

             # Check if document with this content hash already exists
             existing_doc_by_hash_result = await session.execute(
@@ -474,7 +474,7 @@ async def index_notion_pages(

             document_parts.append("</DOCUMENT>")
             combined_document_string = '\n'.join(document_parts)
-            content_hash = generate_content_hash(combined_document_string)
+            content_hash = generate_content_hash(combined_document_string, search_space_id)

             # Check if document with this content hash already exists
             existing_doc_by_hash_result = await session.execute(
@@ -658,7 +658,7 @@ async def index_github_repos(
                 logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.")
                 continue # Skip if content fetch failed

-            content_hash = generate_content_hash(file_content)
+            content_hash = generate_content_hash(file_content, search_space_id)

             # Check if document with this content hash already exists
             existing_doc_by_hash_result = await session.execute(
@@ -897,7 +897,7 @@ async def index_linear_issues(
             comment_count = len(formatted_issue.get("comments", []))
             summary_content += f"Comments: {comment_count}"

-            content_hash = generate_content_hash(issue_content)
+            content_hash = generate_content_hash(issue_content, search_space_id)

             # Check if document with this content hash already exists
             existing_doc_by_hash_result = await session.execute(
@@ -1151,7 +1151,7 @@ async def index_discord_messages(
             document_parts.append(f"</{section_title}>")
             document_parts.append("</DOCUMENT>")
             combined_document_string = '\n'.join(document_parts)
-            content_hash = generate_content_hash(combined_document_string)
+            content_hash = generate_content_hash(combined_document_string, search_space_id)

             # Check if document with this content hash already exists
             existing_doc_by_hash_result = await session.execute(
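
Every call site above pairs the hash computation with a lookup for an existing document carrying the same hash; the query itself is cut off in these hunks. The following is a minimal sketch of what that duplicate check plausibly looks like, assuming a SQLAlchemy Document model with a content_hash column; the stand-in model below is illustrative only and is not the project's actual model.

from sqlalchemy import Integer, String, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Document(Base):
    # Stand-in model for illustration; the real Document model has more columns.
    __tablename__ = "documents"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    content_hash: Mapped[str] = mapped_column(String, index=True)
    search_space_id: Mapped[int] = mapped_column(Integer)


async def find_existing_document(session: AsyncSession, content_hash: str):
    # The hash already encodes the search space, so a plain equality match on
    # content_hash deduplicates within a search space without suppressing the
    # same content indexed into a different search space.
    existing_doc_by_hash_result = await session.execute(
        select(Document).where(Document.content_hash == content_hash)
    )
    return existing_doc_by_hash_result.scalars().first()

Folding the search space id into the hash means none of these duplicate-check queries need to change; only the hash input does.
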
@@ -141,6 +141,7 @@ def convert_chunks_to_langchain_documents(chunks):
     return langchain_docs


-def generate_content_hash(content: str) -> str:
-    """Generate SHA-256 hash for the given content."""
-    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+def generate_content_hash(content: str, search_space_id: int) -> str:
+    """Generate SHA-256 hash for the given content combined with search space ID."""
+    combined_data = f"{search_space_id}:{content}"
+    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()
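
For reference, a self-contained illustration of the behavioral change in this commit: old_generate_content_hash reproduces the removed implementation, generate_content_hash is the new one from the hunk above, and the sample content string is arbitrary.

import hashlib


def old_generate_content_hash(content: str) -> str:
    """Previous behavior: hash depends on the content only."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()


def generate_content_hash(content: str, search_space_id: int) -> str:
    """New behavior: hash depends on the search space as well."""
    combined_data = f"{search_space_id}:{content}"
    return hashlib.sha256(combined_data.encode('utf-8')).hexdigest()


content = "<DOCUMENT>example crawled page</DOCUMENT>"

# Old scheme: identical content produced identical hashes regardless of search
# space, so indexing the same document into a second search space was skipped
# as a duplicate.
assert old_generate_content_hash(content) == old_generate_content_hash(content)

# New scheme: the same content hashed under two search spaces yields two
# different hashes, which allows reindexing in separate search spaces.
assert generate_content_hash(content, 1) != generate_content_hash(content, 2)

# The hash stays deterministic within a single search space, so duplicate
# detection inside one space is preserved.
assert generate_content_hash(content, 1) == generate_content_hash(content, 1)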