Merge pull request #274 from MODSetter/dev

fix: fixed search scope switching duplicate sources.
Rohan Verma 2025-08-20 10:24:11 -07:00 committed by GitHub
commit d3759f3c08
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 65 additions and 52 deletions
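At a glance, the substance of the change is that user-selected documents and hybrid search hits are no longer flattened into one concatenated entry per document; each chunk now becomes its own result keyed by its real chunk id. A rough sketch of the old versus new result shape, using made-up ids and titles rather than anything taken from the repository:

# Sketch with hypothetical values; not part of the diff below.

# Before: one entry per document, chunk contents joined into a single string.
old_entry = {
    "chunk_id": "user_doc_42",        # synthetic id derived from the document id
    "content": "chunk one chunk two",  # concatenated chunk contents
    "score": 0.5,
    "document": {"id": 42, "title": "Quarterly report", "document_type": "FILE"},
}

# After: one entry per chunk, keyed by the chunk's own database id.
new_entries = [
    {
        "chunk_id": 1001,
        "content": "chunk one",
        "score": 0.5,
        "document": {"id": 1001, "title": "Quarterly report", "document_type": "FILE"},
    },
    {
        "chunk_id": 1002,
        "content": "chunk two",
        "score": 0.5,
        "document": {"id": 1002, "title": "Quarterly report", "document_type": "FILE"},
    },
]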


@@ -71,39 +71,41 @@ async def fetch_documents_by_ids(
         chunks_result = await db_session.execute(chunks_query)
         chunks = chunks_result.scalars().all()
 
-        # Concatenate chunks content (similar to SearchMode.DOCUMENTS approach)
-        concatenated_chunks_content = (
-            " ".join([chunk.content for chunk in chunks]) if chunks else doc.content
-        )
-
-        # Format to match connector service return format
-        formatted_doc = {
-            "chunk_id": f"user_doc_{doc.id}",
-            "content": concatenated_chunks_content,  # Use concatenated content like DOCUMENTS mode
-            "score": 0.5,  # High score since user explicitly selected these
-            "document": {
-                "id": doc.id,
-                "title": doc.title,
-                "document_type": (
-                    doc.document_type.value if doc.document_type else "UNKNOWN"
-                ),
-                "metadata": doc.document_metadata or {},
-            },
-            "source": doc.document_type.value if doc.document_type else "UNKNOWN",
-        }
-        formatted_documents.append(formatted_doc)
+        # Return individual chunks instead of concatenated content
+        if chunks:
+            for chunk in chunks:
+                # Format each chunk to match connector service return format
+                formatted_chunk = {
+                    "chunk_id": chunk.id,
+                    "content": chunk.content,  # Use individual chunk content
+                    "score": 0.5,  # High score since user explicitly selected these
+                    "document": {
+                        "id": chunk.id,
+                        "title": doc.title,
+                        "document_type": (
+                            doc.document_type.value
+                            if doc.document_type
+                            else "UNKNOWN"
+                        ),
+                        "metadata": doc.document_metadata or {},
+                    },
+                    "source": doc.document_type.value
+                    if doc.document_type
+                    else "UNKNOWN",
+                }
+                formatted_documents.append(formatted_chunk)
 
         # Group by document type for source objects
-        doc_type = doc.document_type.value if doc.document_type else "UNKNOWN"
+        doc_type = (
+            doc.document_type.value if doc.document_type else "UNKNOWN"
+        )
         if doc_type not in documents_by_type:
             documents_by_type[doc_type] = []
         documents_by_type[doc_type].append(doc)
 
     # Create source objects for each document type (similar to ConnectorService)
     source_objects = []
-    connector_id_counter = (
-        100  # Start from 100 to avoid conflicts with regular connectors
-    )
+    connector_id_counter = 100
 
     for doc_type, docs in documents_by_type.items():
         sources_list = []
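The hunk above ends just as the per-type grouping feeds into source-object creation, so the loop body itself is not shown. A minimal sketch of what that step could look like, assuming each source object carries a synthetic connector id, a display name, and the per-document sources_list; every field name not visible in the diff is an assumption, not the project's actual schema:

# Sketch only; field names beyond those in the diff are assumptions.
source_objects = []
connector_id_counter = 100  # start above the ids used by regular connectors

for doc_type, docs in documents_by_type.items():
    sources_list = [
        {"id": doc.id, "title": doc.title}  # assumed per-document summary
        for doc in docs
    ]
    source_objects.append(
        {
            "id": connector_id_counter,           # synthetic connector id for the UI
            "name": f"User selected {doc_type}",  # assumed display name
            "type": doc_type,
            "sources": sources_list,
        }
    )
    connector_id_counter += 1  # matches the increment shown in the next hunk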
@@ -395,7 +397,7 @@ async def fetch_documents_by_ids(
         connector_id_counter += 1
 
     print(
-        f"Fetched {len(formatted_documents)} user-selected documents (with concatenated chunks) from {len(document_ids)} requested IDs"
+        f"Fetched {len(formatted_documents)} user-selected chunks from {len(document_ids)} requested document IDs"
     )
     print(f"Created {len(source_objects)} source objects for UI display")

@@ -1708,7 +1710,7 @@ async def handle_qna_workflow(
     )
 
     # Use a reasonable top_k for QNA - not too many documents to avoid overwhelming the LLM
-    top_k = 20
+    top_k = 5 if configuration.search_mode == SearchMode.DOCUMENTS else 20
     relevant_documents = []
     user_selected_documents = []
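Since each retrieved item is now an individual chunk, DOCUMENTS mode pulls far fewer of them so the QnA prompt is not padded with near-duplicate text from the same file. A standalone sketch of that selection, assuming SearchMode is a plain enum whose only member confirmed by this diff is DOCUMENTS:

from enum import Enum

class SearchMode(str, Enum):
    DOCUMENTS = "DOCUMENTS"
    CHUNKS = "CHUNKS"  # assumed second member, for illustration only

def pick_top_k(search_mode: SearchMode) -> int:
    # Mirrors the changed line: a tighter limit when whole documents expand into chunks.
    return 5 if search_mode == SearchMode.DOCUMENTS else 20

print(pick_top_k(SearchMode.DOCUMENTS))  # 5
print(pick_top_k(SearchMode.CHUNKS))     # 20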


@@ -240,7 +240,7 @@ class DocumentHybridSearchRetriever:
         if not documents_with_scores:
             return []
 
-        # Convert to serializable dictionaries
+        # Convert to serializable dictionaries - return individual chunks
         serialized_results = []
         for document, score in documents_with_scores:
             # Fetch associated chunks for this document

@@ -254,26 +254,36 @@ class DocumentHybridSearchRetriever:
             chunks_result = await self.db_session.execute(chunks_query)
             chunks = chunks_result.scalars().all()
 
-            # Concatenate chunks content
-            concatenated_chunks_content = (
-                " ".join([chunk.content for chunk in chunks])
-                if chunks
-                else document.content
-            )
-
-            serialized_results.append(
-                {
-                    "document_id": document.id,
-                    "title": document.title,
-                    "content": document.content,
-                    "chunks_content": concatenated_chunks_content,
-                    "document_type": document.document_type.value
-                    if hasattr(document, "document_type")
-                    else None,
-                    "metadata": document.document_metadata,
-                    "score": float(score),  # Ensure score is a Python float
-                    "search_space_id": document.search_space_id,
-                }
-            )
+            # Return individual chunks instead of concatenated content
+            if chunks:
+                for chunk in chunks:
+                    serialized_results.append(
+                        {
+                            "document_id": chunk.id,
+                            "title": document.title,
+                            "content": chunk.content,  # Use chunk content instead of document content
+                            "document_type": document.document_type.value
+                            if hasattr(document, "document_type")
+                            else None,
+                            "metadata": document.document_metadata,
+                            "score": float(score),  # Ensure score is a Python float
+                            "search_space_id": document.search_space_id,
+                        }
+                    )
+            else:
+                # If no chunks exist, return the document content as a single result
+                serialized_results.append(
+                    {
+                        "document_id": document.id,
+                        "title": document.title,
+                        "content": document.content,
+                        "document_type": document.document_type.value
+                        if hasattr(document, "document_type")
+                        else None,
+                        "metadata": document.document_metadata,
+                        "score": float(score),  # Ensure score is a Python float
+                        "search_space_id": document.search_space_id,
+                    }
+                )
 
         return serialized_results
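Because each entry is a plain dict and the score is coerced to a native float, the retriever's output can be ranked and serialized without further conversion. A small usage sketch (the variable comes from the method above; the slice size is arbitrary):

import json

top_chunks = sorted(serialized_results, key=lambda r: r["score"], reverse=True)[:5]
payload = json.dumps(top_chunks)  # works because every score is a plain Python float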


@@ -222,6 +222,7 @@ class ConnectorService:
         for doc in document_results:
             transformed_results.append(
                 {
+                    "chunk_id": doc.get("document_id"),
                     "document": {
                         "id": doc.get("document_id"),
                         "title": doc.get("title", "Untitled Document"),