Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-01 10:09:08 +00:00)

Merge pull request #274 from MODSetter/dev

fix: fixed search scope switching duplicate sources.

Commit: d3759f3c08
3 changed files with 65 additions and 52 deletions
@@ -71,39 +71,41 @@ async def fetch_documents_by_ids(
         chunks_result = await db_session.execute(chunks_query)
         chunks = chunks_result.scalars().all()

-        # Concatenate chunks content (similar to SearchMode.DOCUMENTS approach)
-        concatenated_chunks_content = (
-            " ".join([chunk.content for chunk in chunks]) if chunks else doc.content
-        )
-
-        # Format to match connector service return format
-        formatted_doc = {
-            "chunk_id": f"user_doc_{doc.id}",
-            "content": concatenated_chunks_content,  # Use concatenated content like DOCUMENTS mode
-            "score": 0.5,  # High score since user explicitly selected these
-            "document": {
-                "id": doc.id,
-                "title": doc.title,
-                "document_type": (
-                    doc.document_type.value if doc.document_type else "UNKNOWN"
-                ),
-                "metadata": doc.document_metadata or {},
-            },
-            "source": doc.document_type.value if doc.document_type else "UNKNOWN",
-        }
-        formatted_documents.append(formatted_doc)
+        # Return individual chunks instead of concatenated content
+        if chunks:
+            for chunk in chunks:
+                # Format each chunk to match connector service return format
+                formatted_chunk = {
+                    "chunk_id": chunk.id,
+                    "content": chunk.content,  # Use individual chunk content
+                    "score": 0.5,  # High score since user explicitly selected these
+                    "document": {
+                        "id": chunk.id,
+                        "title": doc.title,
+                        "document_type": (
+                            doc.document_type.value
+                            if doc.document_type
+                            else "UNKNOWN"
+                        ),
+                        "metadata": doc.document_metadata or {},
+                    },
+                    "source": doc.document_type.value
+                    if doc.document_type
+                    else "UNKNOWN",
+                }
+                formatted_documents.append(formatted_chunk)

         # Group by document type for source objects
-        doc_type = doc.document_type.value if doc.document_type else "UNKNOWN"
+        doc_type = (
+            doc.document_type.value if doc.document_type else "UNKNOWN"
+        )
         if doc_type not in documents_by_type:
             documents_by_type[doc_type] = []
         documents_by_type[doc_type].append(doc)

     # Create source objects for each document type (similar to ConnectorService)
     source_objects = []
-    connector_id_counter = (
-        100  # Start from 100 to avoid conflicts with regular connectors
-    )
+    connector_id_counter = 100

     for doc_type, docs in documents_by_type.items():
         sources_list = []
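For illustration, here is a minimal, self-contained sketch of the pattern the new code follows: emit one result entry per chunk, then group documents by type so that only one source object is created per document type. The helper name build_chunk_results and the plain-dict inputs are illustrative stand-ins, not the repository's actual interfaces.

    from collections import defaultdict

    def build_chunk_results(documents: list[dict]) -> tuple[list[dict], list[dict]]:
        formatted_documents = []
        documents_by_type = defaultdict(list)

        for doc in documents:
            doc_type = doc.get("document_type", "UNKNOWN")
            for chunk in doc.get("chunks", []):
                # One entry per chunk, mirroring the connector return format
                formatted_documents.append(
                    {
                        "chunk_id": chunk["id"],
                        "content": chunk["content"],
                        "score": 0.5,
                        "source": doc_type,
                    }
                )
            documents_by_type[doc_type].append(doc)

        # One source object per document type, regardless of how many
        # documents or chunks share that type.
        source_objects = []
        connector_id_counter = 100  # start above regular connector ids
        for doc_type, docs in documents_by_type.items():
            source_objects.append(
                {"id": connector_id_counter, "type": doc_type, "count": len(docs)}
            )
            connector_id_counter += 1

        return formatted_documents, source_objects

A quick check of the shape: two FILE documents with three chunks between them yield three chunk entries but a single FILE source object.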
@@ -395,7 +397,7 @@ async def fetch_documents_by_ids(
         connector_id_counter += 1

     print(
-        f"Fetched {len(formatted_documents)} user-selected documents (with concatenated chunks) from {len(document_ids)} requested IDs"
+        f"Fetched {len(formatted_documents)} user-selected chunks from {len(document_ids)} requested document IDs"
     )
     print(f"Created {len(source_objects)} source objects for UI display")
@@ -1708,7 +1710,7 @@ async def handle_qna_workflow(
     )

     # Use a reasonable top_k for QNA - not too many documents to avoid overwhelming the LLM
-    top_k = 20
+    top_k = 5 if configuration.search_mode == SearchMode.DOCUMENTS else 20

     relevant_documents = []
     user_selected_documents = []
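The mode-dependent top_k above can be read as: whole documents carry far more text than chunks, so fewer of them are retrieved to keep the LLM context manageable. A tiny sketch of the same selection logic, using a stand-in SearchMode enum (the project's real enum may define different members):

    from enum import Enum

    class SearchMode(Enum):
        CHUNKS = "CHUNKS"
        DOCUMENTS = "DOCUMENTS"

    def pick_top_k(search_mode: SearchMode) -> int:
        # Fewer results in DOCUMENTS mode to avoid overwhelming the LLM
        return 5 if search_mode == SearchMode.DOCUMENTS else 20

    assert pick_top_k(SearchMode.DOCUMENTS) == 5
    assert pick_top_k(SearchMode.CHUNKS) == 20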
@@ -240,7 +240,7 @@ class DocumentHybridSearchRetriever:
         if not documents_with_scores:
             return []

-        # Convert to serializable dictionaries
+        # Convert to serializable dictionaries - return individual chunks
         serialized_results = []
         for document, score in documents_with_scores:
             # Fetch associated chunks for this document
@@ -254,26 +254,36 @@ class DocumentHybridSearchRetriever:
             chunks_result = await self.db_session.execute(chunks_query)
             chunks = chunks_result.scalars().all()

-            # Concatenate chunks content
-            concatenated_chunks_content = (
-                " ".join([chunk.content for chunk in chunks])
-                if chunks
-                else document.content
-            )
-
-            serialized_results.append(
-                {
-                    "document_id": document.id,
-                    "title": document.title,
-                    "content": document.content,
-                    "chunks_content": concatenated_chunks_content,
-                    "document_type": document.document_type.value
-                    if hasattr(document, "document_type")
-                    else None,
-                    "metadata": document.document_metadata,
-                    "score": float(score),  # Ensure score is a Python float
-                    "search_space_id": document.search_space_id,
-                }
-            )
+            # Return individual chunks instead of concatenated content
+            if chunks:
+                for chunk in chunks:
+                    serialized_results.append(
+                        {
+                            "document_id": chunk.id,
+                            "title": document.title,
+                            "content": chunk.content,  # Use chunk content instead of document content
+                            "document_type": document.document_type.value
+                            if hasattr(document, "document_type")
+                            else None,
+                            "metadata": document.document_metadata,
+                            "score": float(score),  # Ensure score is a Python float
+                            "search_space_id": document.search_space_id,
+                        }
+                    )
+            else:
+                # If no chunks exist, return the document content as a single result
+                serialized_results.append(
+                    {
+                        "document_id": document.id,
+                        "title": document.title,
+                        "content": document.content,
+                        "document_type": document.document_type.value
+                        if hasattr(document, "document_type")
+                        else None,
+                        "metadata": document.document_metadata,
+                        "score": float(score),  # Ensure score is a Python float
+                        "search_space_id": document.search_space_id,
+                    }
+                )

         return serialized_results
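The retriever change above follows a per-chunk-with-fallback shape that can be expressed as a standalone helper. This is a sketch only; the function name and the plain-dict inputs are illustrative, not the retriever's real signature:

    def serialize_hits(document: dict, chunks: list[dict], score: float) -> list[dict]:
        # Shared fields come from the parent document in both branches.
        base = {
            "title": document["title"],
            "document_type": document.get("document_type"),
            "metadata": document.get("metadata"),
            "score": float(score),
            "search_space_id": document.get("search_space_id"),
        }
        if chunks:
            # One result per chunk, carrying the chunk's own id and content.
            return [
                {**base, "document_id": chunk["id"], "content": chunk["content"]}
                for chunk in chunks
            ]
        # Fallback: no chunks, so return the whole document as a single result.
        return [{**base, "document_id": document["id"], "content": document["content"]}]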
@@ -222,6 +222,7 @@ class ConnectorService:
             for doc in document_results:
                 transformed_results.append(
                     {
+                        "chunk_id": doc.get("document_id"),
                         "document": {
                             "id": doc.get("document_id"),
                             "title": doc.get("title", "Untitled Document"),
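The added "chunk_id" gives every transformed result a stable identifier. One way such an id can keep the same chunk from appearing twice when results from different search scopes are combined is a keep-first merge; this helper is purely illustrative and not code from ConnectorService:

    def merge_by_chunk_id(*result_lists: list[dict]) -> list[dict]:
        # Keep only the first occurrence of each chunk_id across all lists.
        seen: set = set()
        merged: list[dict] = []
        for results in result_lists:
            for item in results:
                key = item.get("chunk_id")
                if key in seen:
                    continue
                seen.add(key)
                merged.append(item)
        return merged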