feat: Add content-based hashing to prevent duplicates and fix resync issues

DESKTOP-RTLN3BA\$punk committed 2025-05-28 23:52:00 -07:00
parent 38516e74f9
commit 5411bac8e0
17 changed files with 297 additions and 334 deletions
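
What the change amounts to, as a minimal sketch (the `sync_chunks` helper and `seen_hashes` store are hypothetical illustrations, not part of this diff): hash each chunk's content with SHA-256 and skip re-inserting anything whose hash is already known, so a resync cannot create duplicates.

import hashlib

def generate_content_hash(content: str) -> str:
    """Generate SHA-256 hash for the given content."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def sync_chunks(chunks, seen_hashes):
    """Keep only chunks whose content hash has not been seen before."""
    fresh = []
    for chunk in chunks:
        digest = generate_content_hash(chunk["content"])
        if digest not in seen_hashes:  # same content -> same digest -> skipped
            seen_hashes.add(digest)
            fresh.append(chunk)
    return fresh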


@@ -1,19 +1,22 @@
import hashlib


async def convert_element_to_markdown(element) -> str:
    """
    Convert an Unstructured element to markdown format based on its category.

    Args:
        element: The Unstructured API element object

    Returns:
        str: Markdown formatted string
    """
    element_category = element.metadata["category"]
    content = element.page_content

    if not content:
        return ""

    markdown_mapping = {
        "Formula": lambda x: f"```math\n{x}\n```",
        "FigureCaption": lambda x: f"*Figure: {x}*",
@@ -31,7 +34,7 @@ async def convert_element_to_markdown(element) -> str:
        "PageNumber": lambda x: f"*Page {x}*\n\n",
        "UncategorizedText": lambda x: f"{x}\n\n"
    }

    converter = markdown_mapping.get(element_category, lambda x: x)
    return converter(content)
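
For orientation, a rough usage sketch of the converter above (the `FakeElement` stub is hypothetical; real elements come from the Unstructured API):

import asyncio

class FakeElement:
    """Hypothetical stand-in for an Unstructured element."""
    def __init__(self, category, text):
        self.metadata = {"category": category}
        self.page_content = text

element = FakeElement("FigureCaption", "System overview")
print(asyncio.run(convert_element_to_markdown(element)))
# -> *Figure: System overview*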
@@ -39,29 +42,30 @@ async def convert_element_to_markdown(element) -> str:
async def convert_document_to_markdown(elements):
    """
    Convert all document elements to markdown.

    Args:
        elements: List of Unstructured API elements

    Returns:
        str: Complete markdown document
    """
    markdown_parts = []

    for element in elements:
        markdown_text = await convert_element_to_markdown(element)
        if markdown_text:
            markdown_parts.append(markdown_text)

    return "".join(markdown_parts)

def convert_chunks_to_langchain_documents(chunks):
    """
    Convert chunks from hybrid search results to LangChain Document objects.

    Args:
        chunks: List of chunk dictionaries from hybrid search results

    Returns:
        List of LangChain Document objects
    """
@@ -71,20 +75,20 @@ def convert_chunks_to_langchain_documents(chunks):
        raise ImportError(
            "LangChain is not installed. Please install it with `pip install langchain langchain-core`"
        )

    langchain_docs = []

    for chunk in chunks:
        # Extract content from the chunk
        content = chunk.get("content", "")

        # Create metadata dictionary
        metadata = {
            "chunk_id": chunk.get("chunk_id"),
            "score": chunk.get("score"),
            "rank": chunk.get("rank") if "rank" in chunk else None,
        }

        # Add document information to metadata
        if "document" in chunk:
            doc = chunk["document"]
@@ -93,24 +97,25 @@ def convert_chunks_to_langchain_documents(chunks):
                "document_title": doc.get("title"),
                "document_type": doc.get("document_type"),
            })

            # Add document metadata if available
            if "metadata" in doc:
                # Prefix document metadata keys to avoid conflicts
                doc_metadata = {f"doc_meta_{k}": v for k,
                                v in doc.get("metadata", {}).items()}
                metadata.update(doc_metadata)

                # Add source URL if available in metadata
                if "url" in doc.get("metadata", {}):
                    metadata["source"] = doc["metadata"]["url"]
                elif "sourceURL" in doc.get("metadata", {}):
                    metadata["source"] = doc["metadata"]["sourceURL"]

        # Ensure source_id is set for citation purposes
        # Use document_id as the source_id if available
        if "document_id" in metadata:
            metadata["source_id"] = metadata["document_id"]

        # Update content for citation mode - format as XML with explicit source_id
        new_content = f"""
<document>
@@ -124,13 +129,18 @@ def convert_chunks_to_langchain_documents(chunks):
</content>
</document>
"""

        # Create LangChain Document
        langchain_doc = LangChainDocument(
            page_content=new_content,
            metadata=metadata
        )

        langchain_docs.append(langchain_doc)

    return langchain_docs
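
To make the expected input shape concrete, a hypothetical chunk as it might arrive from hybrid search (all field values are invented):

chunks = [{
    "chunk_id": "c1",
    "content": "Content hashing prevents duplicate inserts.",
    "score": 0.87,
    "document": {
        "title": "Sync notes",
        "document_type": "markdown",
        "metadata": {"url": "https://example.com/notes"},
    },
}]
docs = convert_chunks_to_langchain_documents(chunks)
# docs[0].metadata["source"] == "https://example.com/notes"
# docs[0].page_content is the XML-wrapped citation block built above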

def generate_content_hash(content: str) -> str:
    """Generate SHA-256 hash for the given content."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()
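
The new helper is deterministic, which is exactly what the duplicate check relies on; a quick sanity sketch:

h1 = generate_content_hash("same text")
h2 = generate_content_hash("same text")
assert h1 == h2          # identical content -> identical digest
assert len(h1) == 64     # SHA-256 hex digest is 64 characters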