From fbbb3294f4c67d50b590420bc39d233f29eae862 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 11 May 2025 23:04:48 -0700 Subject: [PATCH 1/2] feat: Introduce the RAPTOR Search. --- .../app/agents/researcher/configuration.py | 7 + .../app/agents/researcher/nodes.py | 32 ++- .../researcher/sub_section_writer/nodes.py | 10 +- .../researcher/sub_section_writer/prompts.py | 21 ++ .../app/retriver/documents_hybrid_search.py | 14 +- surfsense_backend/app/routes/chats_routes.py | 7 +- .../tasks/stream_connector_search_results.py | 13 +- .../app/utils/connector_service.py | 253 +++++++++++++----- .../researcher/[chat_id]/page.tsx | 82 +++--- .../components/chat/ConnectorComponents.tsx | 2 +- .../components/chat/SegmentedControl.tsx | 4 +- 11 files changed, 318 insertions(+), 127 deletions(-) diff --git a/surfsense_backend/app/agents/researcher/configuration.py b/surfsense_backend/app/agents/researcher/configuration.py index 8ba3849..0eb34b5 100644 --- a/surfsense_backend/app/agents/researcher/configuration.py +++ b/surfsense_backend/app/agents/researcher/configuration.py @@ -3,10 +3,16 @@ from __future__ import annotations from dataclasses import dataclass, fields +from enum import Enum from typing import Optional, List, Any from langchain_core.runnables import RunnableConfig +class SearchMode(Enum): + """Enum defining the type of search mode.""" + CHUNKS = "CHUNKS" + DOCUMENTS = "DOCUMENTS" + @dataclass(kw_only=True) class Configuration: @@ -18,6 +24,7 @@ class Configuration: connectors_to_search: List[str] user_id: str search_space_id: int + search_mode: SearchMode @classmethod diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index b0b81ae..644ddd9 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -10,7 +10,7 @@ from langchain_core.runnables import RunnableConfig from pydantic import BaseModel, Field from 
sqlalchemy.ext.asyncio import AsyncSession -from .configuration import Configuration +from .configuration import Configuration, SearchMode from .prompts import get_answer_outline_system_prompt from .state import State from .sub_section_writer.graph import graph as sub_section_writer_graph @@ -149,7 +149,8 @@ async def fetch_relevant_documents( writer: StreamWriter = None, state: State = None, top_k: int = 10, - connector_service: ConnectorService = None + connector_service: ConnectorService = None, + search_mode: SearchMode = SearchMode.CHUNKS ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. @@ -213,7 +214,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -231,7 +233,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -249,7 +252,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -267,7 +271,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -286,7 +291,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -304,7 +310,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) 
# Add to sources and raw documents @@ -322,7 +329,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -340,7 +348,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -558,7 +567,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW writer=writer, state=state, top_k=TOP_K, - connector_service=connector_service + connector_service=connector_service, + search_mode=configuration.search_mode ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py index 765b619..5853283 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py @@ -141,6 +141,11 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A # Construct a clear, structured query for the LLM human_message_content = f""" + Source material: + + {documents_text} + + Now user's query is: {user_query} @@ -158,11 +163,6 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A {questions_text} - - Use the provided documents as your source material and cite them properly using the IEEE citation format [X] where X is the source_id. 
- - {documents_text} - """ # Create messages for the LLM diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py index 18a91eb..48345c9 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py @@ -25,6 +25,8 @@ You are a research assistant tasked with analyzing documents and providing compr 16. CRITICAL: Citations must ONLY appear as [X] or [X], [Y], [Z] format - never with parentheses, hyperlinks, or other formatting. 17. CRITICAL: Never make up citation numbers. Only use source_id values that are explicitly provided in the document metadata. 18. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up. +19. CRITICAL: Focus only on answering the user's query. Any guiding questions provided are for your thinking process only and should not be mentioned in your response. +20. CRITICAL: Ensure your response aligns with the provided sub-section title and section position. @@ -37,6 +39,8 @@ You are a research assistant tasked with analyzing documents and providing compr - NEVER create your own citation numbering system - use the exact source_id values from the documents. - NEVER format citations as clickable links or as markdown links like "([1](https://example.com))". Always use plain square brackets only. - NEVER make up citation numbers if you are unsure about the source_id. It is better to omit the citation than to guess. +- NEVER include or mention the guiding questions in your response. They are only to help guide your thinking. +- ALWAYS focus on answering the user's query directly from the information in the documents. 
@@ -84,4 +88,21 @@ ONLY use plain square brackets [1] or multiple citations [1], [2], [3] Note that the citation numbers match exactly with the source_id values (1, 13, and 21) and are not renumbered sequentially. Citations follow IEEE style with square brackets and appear at the end of sentences. + + +When you see a user query like: + + Give all linear issues. + + +Focus exclusively on answering this query using information from the provided documents. + +If guiding questions are provided in a section, use them only to guide your thinking process. Do not mention or list these questions in your response. + +Make sure your response: +1. Directly answers the user's query +2. Fits the provided sub-section title and section position +3. Uses proper citations for all information from documents +4. Is well-structured and professional in tone + """ \ No newline at end of file diff --git a/surfsense_backend/app/retriver/documents_hybrid_search.py b/surfsense_backend/app/retriver/documents_hybrid_search.py index 060c3b1..2163635 100644 --- a/surfsense_backend/app/retriver/documents_hybrid_search.py +++ b/surfsense_backend/app/retriver/documents_hybrid_search.py @@ -113,8 +113,6 @@ class DocumentHybridSearchRetriever: search_space_id: Optional search space ID to filter results document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL") - Returns: - List of dictionaries containing document data and relevance scores """ from sqlalchemy import select, func, text from sqlalchemy.orm import joinedload @@ -224,10 +222,22 @@ class DocumentHybridSearchRetriever: # Convert to serializable dictionaries serialized_results = [] for document, score in documents_with_scores: + # Fetch associated chunks for this document + from sqlalchemy import select + from app.db import Chunk + + chunks_query = select(Chunk).where(Chunk.document_id == document.id).order_by(Chunk.id) + chunks_result = await self.db_session.execute(chunks_query) + chunks = 
chunks_result.scalars().all() + + # Concatenate chunks content + concatenated_chunks_content = " ".join([chunk.content for chunk in chunks]) if chunks else document.content + serialized_results.append({ "document_id": document.id, "title": document.title, "content": document.content, + "chunks_content": concatenated_chunks_content, "document_type": document.document_type.value if hasattr(document, 'document_type') else None, "metadata": document.document_metadata, "score": float(score), # Ensure score is a Python float diff --git a/surfsense_backend/app/routes/chats_routes.py b/surfsense_backend/app/routes/chats_routes.py index 62c7e8a..9a2aa79 100644 --- a/surfsense_backend/app/routes/chats_routes.py +++ b/surfsense_backend/app/routes/chats_routes.py @@ -11,6 +11,8 @@ from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from langchain.schema import HumanMessage, AIMessage + + router = APIRouter() @router.post("/chat") @@ -28,6 +30,8 @@ async def handle_chat_data( search_space_id = request.data.get('search_space_id') research_mode: str = request.data.get('research_mode') selected_connectors: List[str] = request.data.get('selected_connectors') + + search_mode_str = request.data.get('search_mode', "CHUNKS") # Convert search_space_id to integer if it's a string if search_space_id and isinstance(search_space_id, str): @@ -66,7 +70,8 @@ async def handle_chat_data( session, research_mode, selected_connectors, - langchain_chat_history + langchain_chat_history, + search_mode_str )) response.headers['x-vercel-ai-data-stream'] = 'v1' return response diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py index 2f3b50a..aa5f401 100644 --- a/surfsense_backend/app/tasks/stream_connector_search_results.py +++ b/surfsense_backend/app/tasks/stream_connector_search_results.py @@ -6,6 +6,8 @@ from 
app.agents.researcher.state import State
 from app.utils.streaming_service import StreamingService
 from sqlalchemy.ext.asyncio import AsyncSession
+from app.agents.researcher.configuration import SearchMode
+
 
 async def stream_connector_search_results(
     user_query: str,
@@ -14,7 +16,8 @@ async def stream_connector_search_results(
     session: AsyncSession,
     research_mode: str,
     selected_connectors: List[str],
-    langchain_chat_history: List[Any]
+    langchain_chat_history: List[Any],
+    search_mode_str: str
 ) -> AsyncGenerator[str, None]:
     """
     Stream connector search results to the client
@@ -41,6 +44,11 @@ async def stream_connector_search_results(
     # Convert UUID to string if needed
     user_id_str = str(user_id) if isinstance(user_id, UUID) else user_id
 
+    if search_mode_str == "DOCUMENTS":
+        search_mode = SearchMode.DOCUMENTS
+    else:  # default to CHUNKS for "CHUNKS" or any unrecognized value
+        search_mode = SearchMode.CHUNKS
+
     # Sample configuration
     config = {
         "configurable": {
@@ -48,7 +56,8 @@ async def stream_connector_search_results(
             "num_sections": NUM_SECTIONS,
             "connectors_to_search": selected_connectors,
             "user_id": user_id_str,
-            "search_space_id": search_space_id
+            "search_space_id": search_space_id,
+            "search_mode": search_mode
         }
     }
     # Initialize state with database session and streaming service
diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py
index c7ad692..49c3b08 100644
--- a/surfsense_backend/app/utils/connector_service.py
+++ b/surfsense_backend/app/utils/connector_service.py
@@ -4,32 +4,47 @@ import asyncio
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
+from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever
 from app.db import SearchSourceConnector, SearchSourceConnectorType
 from tavily import TavilyClient
 from linkup import LinkupClient
+from app.agents.researcher.configuration import SearchMode
+
 
class ConnectorService: def __init__(self, session: AsyncSession): self.session = session - self.retriever = ChucksHybridSearchRetriever(session) + self.chunk_retriever = ChucksHybridSearchRetriever(session) + self.document_retriever = DocumentHybridSearchRetriever(session) self.source_id_counter = 1 self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments - async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - crawled_urls_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="CRAWLED_URL" - ) + if search_mode == SearchMode.CHUNKS: + crawled_urls_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="CRAWLED_URL" + ) + elif search_mode == SearchMode.DOCUMENTS: + crawled_urls_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="CRAWLED_URL" + ) + # Transform document retriever results to match expected format + crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) # Early return if no results if not crawled_urls_chunks: @@ -71,20 +86,31 @@ class ConnectorService: return result_object, crawled_urls_chunks - async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: 
SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for files and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - files_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="FILE" - ) + if search_mode == SearchMode.CHUNKS: + files_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="FILE" + ) + elif search_mode == SearchMode.DOCUMENTS: + files_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="FILE" + ) + # Transform document retriever results to match expected format + files_chunks = self._transform_document_results(files_chunks) # Early return if no results if not files_chunks: @@ -126,6 +152,31 @@ class ConnectorService: return result_object, files_chunks + def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]: + """ + Transform results from document_retriever.hybrid_search() to match the format + expected by the processing code. 
+ + Args: + document_results: Results from document_retriever.hybrid_search() + + Returns: + List of transformed results in the format expected by the processing code + """ + transformed_results = [] + for doc in document_results: + transformed_results.append({ + 'document': { + 'id': doc.get('document_id'), + 'title': doc.get('title', 'Untitled Document'), + 'document_type': doc.get('document_type'), + 'metadata': doc.get('metadata', {}), + }, + 'content': doc.get('chunks_content', doc.get('content', '')), + 'score': doc.get('score', 0.0) + }) + return transformed_results + async def get_connector_by_type(self, user_id: str, connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]: """ Get a connector by type for a specific user @@ -249,20 +300,31 @@ class ConnectorService: "sources": [], }, [] - async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for slack and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - slack_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + slack_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="SLACK_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + slack_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="SLACK_CONNECTOR" + ) + # Transform document retriever results to match expected format + slack_chunks = 
self._transform_document_results(slack_chunks) # Early return if no results if not slack_chunks: @@ -323,7 +385,7 @@ class ConnectorService: return result_object, slack_chunks - async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for Notion pages and return both the source information and langchain documents @@ -336,14 +398,25 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - notion_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" - ) - + if search_mode == SearchMode.CHUNKS: + notion_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="NOTION_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + notion_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="NOTION_CONNECTOR" + ) + # Transform document retriever results to match expected format + notion_chunks = self._transform_document_results(notion_chunks) + # Early return if no results if not notion_chunks: return { @@ -405,7 +478,7 @@ class ConnectorService: return result_object, notion_chunks - async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for extension data and return both the source information and langchain documents @@ -418,14 +491,25 @@ class ConnectorService: Returns: 
tuple: (sources_info, langchain_documents) """ - extension_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="EXTENSION" - ) - + if search_mode == SearchMode.CHUNKS: + extension_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="EXTENSION" + ) + elif search_mode == SearchMode.DOCUMENTS: + extension_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="EXTENSION" + ) + # Transform document retriever results to match expected format + extension_chunks = self._transform_document_results(extension_chunks) + # Early return if no results if not extension_chunks: return { @@ -505,7 +589,7 @@ class ConnectorService: return result_object, extension_chunks - async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents @@ -518,13 +602,24 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - youtube_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" - ) + if search_mode == SearchMode.CHUNKS: + youtube_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="YOUTUBE_VIDEO" + ) + elif search_mode == SearchMode.DOCUMENTS: + youtube_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + 
top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="YOUTUBE_VIDEO" + ) + # Transform document retriever results to match expected format + youtube_chunks = self._transform_document_results(youtube_chunks) # Early return if no results if not youtube_chunks: @@ -587,20 +682,31 @@ class ConnectorService: return result_object, youtube_chunks - async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple: + async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - github_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + github_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="GITHUB_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + github_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="GITHUB_CONNECTOR" + ) + # Transform document retriever results to match expected format + github_chunks = self._transform_document_results(github_chunks) # Early return if no results if not github_chunks: @@ -643,7 +749,7 @@ class ConnectorService: return result_object, github_chunks - async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for 
Linear issues and comments and return both the source information and langchain documents @@ -656,14 +762,25 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - linear_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" - ) - + if search_mode == SearchMode.CHUNKS: + linear_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="LINEAR_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + linear_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="LINEAR_CONNECTOR" + ) + # Transform document retriever results to match expected format + linear_chunks = self._transform_document_results(linear_chunks) + # Early return if no results if not linear_chunks: return { diff --git a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx index bc58e8c..78239e2 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx @@ -13,7 +13,9 @@ import { ArrowDown, CircleUser, Database, - SendHorizontal + SendHorizontal, + FileText, + Grid3x3 } from 'lucide-react'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { Button } from '@/components/ui/button'; @@ -248,6 +250,7 @@ const ChatPage = () => { const tabsListRef = useRef(null); const [terminalExpanded, setTerminalExpanded] = useState(false); const [selectedConnectors, setSelectedConnectors] = useState(["CRAWLED_URL"]); + const [searchMode, setSearchMode] = useState<'DOCUMENTS' | 'CHUNKS'>('DOCUMENTS'); const 
[researchMode, setResearchMode] = useState("GENERAL"); const [currentTime, setCurrentTime] = useState(''); const [currentDate, setCurrentDate] = useState(''); @@ -362,7 +365,8 @@ const ChatPage = () => { data: { search_space_id: search_space_id, selected_connectors: selectedConnectors, - research_mode: researchMode + research_mode: researchMode, + search_mode: searchMode } }, onError: (error) => { @@ -557,11 +561,6 @@ const ChatPage = () => { } }, [terminalExpanded]); - // Get total sources count for a connector type - const getSourcesCount = (connectorType: string) => { - return getSourcesCountUtil(getMessageConnectorSources(messages[messages.length - 1]), connectorType); - }; - // Function to check scroll position and update indicators const updateScrollIndicators = () => { updateScrollIndicatorsUtil(tabsListRef as React.RefObject, setCanScrollLeft, setCanScrollRight); @@ -587,23 +586,6 @@ const ChatPage = () => { // Use the scroll to bottom hook useScrollToBottom(messagesEndRef as React.RefObject, [messages]); - // Function to get sources for the main view - const getMainViewSources = (connector: any) => { - return getMainViewSourcesUtil(connector, INITIAL_SOURCES_DISPLAY); - }; - - // Function to get filtered sources for the dialog with null check - const getFilteredSourcesWithCheck = (connector: any, sourceFilter: string) => { - if (!connector?.sources) return []; - return getFilteredSourcesUtil(connector, sourceFilter); - }; - - // Function to get paginated dialog sources with null check - const getPaginatedDialogSourcesWithCheck = (connector: any, sourceFilter: string, expandedSources: boolean, sourcesPage: number, sourcesPerPage: number) => { - if (!connector?.sources) return []; - return getPaginatedDialogSourcesUtil(connector, sourceFilter, expandedSources, sourcesPage, sourcesPerPage); - }; - // Function to get a citation source by ID const getCitationSource = React.useCallback((citationId: number, messageIndex?: number): Source | null => { if (!messages 
|| messages.length === 0) return null; @@ -995,15 +977,17 @@ const ChatPage = () => { Send -
-
+
+
{/* Connector Selection Dialog */} - { }} - /> +
+ { }} + /> +
@@ -1070,12 +1054,40 @@ const ChatPage = () => {
+ {/* Search Mode Control */} +
+ + +
+ {/* Research Mode Segmented Control */} - - value={researchMode} - onChange={setResearchMode} - options={researcherOptions} - /> +
+ + value={researchMode} + onChange={setResearchMode} + options={researcherOptions} + /> +
diff --git a/surfsense_web/components/chat/ConnectorComponents.tsx b/surfsense_web/components/chat/ConnectorComponents.tsx index e52348c..163d5bf 100644 --- a/surfsense_web/components/chat/ConnectorComponents.tsx +++ b/surfsense_web/components/chat/ConnectorComponents.tsx @@ -147,7 +147,7 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources return ( -
-
+
+
{/* Connector Selection Dialog */} - { }} - /> +
+ { }} + /> +
@@ -1070,12 +1054,40 @@ const ChatPage = () => {
+ {/* Search Mode Control */} +
+ + +
+ {/* Research Mode Segmented Control */} - - value={researchMode} - onChange={setResearchMode} - options={researcherOptions} - /> +
+ + value={researchMode} + onChange={setResearchMode} + options={researcherOptions} + /> +
diff --git a/surfsense_web/components/chat/ConnectorComponents.tsx b/surfsense_web/components/chat/ConnectorComponents.tsx index e52348c..163d5bf 100644 --- a/surfsense_web/components/chat/ConnectorComponents.tsx +++ b/surfsense_web/components/chat/ConnectorComponents.tsx @@ -147,7 +147,7 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources return (