Mirror of https://github.com/MODSetter/SurfSense.git, synced 2025-09-01 18:19:08 +00:00
fix: citations for user selected documents.
This commit is contained in:
parent 3d93fe8186
commit 9dba1930de
1 changed file with 85 additions and 10 deletions
@@ -26,6 +26,77 @@ from .sub_section_writer.graph import graph as sub_section_writer_graph
 from .utils import AnswerOutline, get_connector_emoji, get_connector_friendly_name
 
 
+def extract_sources_from_documents(
+    all_documents: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """
+    Extract sources from all_documents and group them by document type.
+
+    Args:
+        all_documents: List of document chunks from user-selected documents and connector-fetched documents
+
+    Returns:
+        List of source objects grouped by type for streaming
+    """
+    # Group documents by their source type
+    documents_by_type = {}
+
+    for doc in all_documents:
+        # Get source type from the document
+        source_type = doc.get("source", "UNKNOWN")
+        document_info = doc.get("document", {})
+        document_type = document_info.get("document_type", source_type)
+
+        # Use document_type if available, otherwise use source
+        group_type = document_type if document_type != "UNKNOWN" else source_type
+
+        if group_type not in documents_by_type:
+            documents_by_type[group_type] = []
+        documents_by_type[group_type].append(doc)
+
+    # Create source objects for each document type
+    source_objects = []
+    source_id_counter = 1
+
+    for doc_type, docs in documents_by_type.items():
+        sources_list = []
+
+        for doc in docs:
+            document_info = doc.get("document", {})
+            metadata = document_info.get("metadata", {})
+
+            # Create source entry based on document structure
+            source = {
+                "id": doc.get("chunk_id", source_id_counter),
+                "title": document_info.get("title", "Untitled Document"),
+                "description": doc.get("content", "")[:100] + "..."
+                if len(doc.get("content", "")) > 100
+                else doc.get("content", ""),
+                "url": metadata.get("url", metadata.get("page_url", "")),
+            }
+
+            source_id_counter += 1
+            sources_list.append(source)
+
+        # Create group object
+        group_name = (
+            get_connector_friendly_name(doc_type)
+            if doc_type != "UNKNOWN"
+            else "Unknown Sources"
+        )
+
+        source_object = {
+            "id": len(source_objects) + 1,
+            "name": group_name,
+            "type": doc_type,
+            "sources": sources_list,
+        }
+
+        source_objects.append(source_object)
+
+    return source_objects
+
+
 async def fetch_documents_by_ids(
     document_ids: list[int], user_id: str, db_session: AsyncSession
 ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
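For orientation, here is a minimal usage sketch of the new helper. Only extract_sources_from_documents, the expected input keys, and "Unknown Sources" come from the diff above; the import path is hypothetical (the file name is not shown here), and the friendly name returned by get_connector_friendly_name is assumed.

# Minimal usage sketch. Assumptions: the module path is illustrative, and
# get_connector_friendly_name is assumed to map "NOTION_CONNECTOR" to "Notion".
from researcher_nodes import extract_sources_from_documents  # hypothetical path

all_documents = [
    {
        "chunk_id": 42,
        "source": "NOTION_CONNECTOR",
        "content": "Quarterly planning notes covering Q3 goals and owners.",
        "document": {
            "document_type": "NOTION_CONNECTOR",
            "title": "Q3 Planning",
            "metadata": {"url": "https://example.com/q3-planning"},
        },
    },
    {
        # No "document" payload: grouping falls back to the chunk's source type.
        "chunk_id": 43,
        "source": "UNKNOWN",
        "content": "Orphaned chunk",
    },
]

groups = extract_sources_from_documents(all_documents)
# Expected shape: one group per document type, e.g.
# [
#     {"id": 1, "name": "Notion", "type": "NOTION_CONNECTOR", "sources": [...]},
#     {"id": 2, "name": "Unknown Sources", "type": "UNKNOWN", "sources": [...]},
# ]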
(The unconditional source streaming inside fetch_relevant_documents is removed here; sources are now streamed at the call sites, after user-selected documents have been merged in, as the later hunks show.)

@@ -1169,16 +1240,6 @@ async def fetch_relevant_documents(
             }
         )
 
-    # After all sources are collected and deduplicated, stream them
-    if streaming_service and writer:
-        writer(
-            {
-                "yield_value": streaming_service.format_sources_delta(
-                    deduplicated_sources
-                )
-            }
-        )
-
     # Deduplicate raw documents based on chunk_id or content
     seen_chunk_ids = set()
     seen_content_hashes = set()
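The context lines above only name the two dedup sets; the loop body lies outside this diff. As a rough sketch of what chunk_id- and content-based deduplication typically looks like (dedupe_documents and the hashing choice are assumptions, not SurfSense code):

import hashlib
from typing import Any


# Hypothetical helper illustrating the dedup the context lines refer to.
def dedupe_documents(docs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    seen_chunk_ids: set[int] = set()
    seen_content_hashes: set[str] = set()
    unique: list[dict[str, Any]] = []
    for doc in docs:
        chunk_id = doc.get("chunk_id")
        content_hash = hashlib.sha256(doc.get("content", "").encode()).hexdigest()
        # Skip a chunk if either its id or its exact content was already seen.
        if chunk_id in seen_chunk_ids or content_hash in seen_content_hashes:
            continue
        if chunk_id is not None:
            seen_chunk_ids.add(chunk_id)
        seen_content_hashes.add(content_hash)
        unique.append(doc)
    return unique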
@@ -1355,6 +1416,13 @@ async def process_sections(
     )
     print(f"Total documents for sections: {len(all_documents)}")
 
+    # Extract and stream sources from all_documents
+    if all_documents:
+        sources_to_stream = extract_sources_from_documents(all_documents)
+        writer(
+            {"yield_value": streaming_service.format_sources_delta(sources_to_stream)}
+        )
+
     writer(
         {
             "yield_value": streaming_service.format_terminal_info_delta(
@@ -1781,6 +1849,13 @@ async def handle_qna_workflow(
     print(f"Added {len(user_selected_documents)} user-selected documents for QNA")
     print(f"Total documents for QNA: {len(all_documents)}")
 
+    # Extract and stream sources from all_documents
+    if all_documents:
+        sources_to_stream = extract_sources_from_documents(all_documents)
+        writer(
+            {"yield_value": streaming_service.format_sources_delta(sources_to_stream)}
+        )
+
     writer(
         {
             "yield_value": streaming_service.format_terminal_info_delta(
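Both call sites use the same handshake: format_sources_delta turns the grouped sources into a payload, and writer forwards it under the "yield_value" key. The diff does not show StreamingService itself, so the stub below only illustrates that contract; the delta layout inside the stub is an assumption.

from typing import Any


class StubStreamingService:
    """Stand-in for SurfSense's StreamingService; only the method name and
    call pattern come from the diff, the payload layout is assumed."""

    def format_sources_delta(self, sources: list[dict[str, Any]]) -> dict[str, Any]:
        # Tag the grouped sources so a client could render citation groups
        # as they stream in.
        return {"type": "sources", "sources": sources}


def demo_writer(event: dict[str, Any]) -> None:
    # Stand-in for the graph's stream writer used at both call sites.
    print(event["yield_value"])


streaming_service = StubStreamingService()
sources_to_stream = [
    {"id": 1, "name": "Unknown Sources", "type": "UNKNOWN", "sources": []},
]
demo_writer({"yield_value": streaming_service.format_sources_delta(sources_to_stream)})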