chore: update README and refactor ConnectorService for improved document handling and error management

DESKTOP-RTLN3BA\$punk 2025-04-27 20:39:17 -07:00
parent 1309f9e262
commit a971bb1f72
4 changed files with 399 additions and 303 deletions


@@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include
- fastapi and related packages
- fastapi-users: Authentication and user management
- firecrawl-py: Web crawling capabilities
- gpt-researcher: Advanced research capabilities
- langchain components for AI workflows
- litellm: LLM model integration
- pgvector: Vector similarity search in PostgreSQL


@@ -143,7 +143,8 @@ async def fetch_relevant_documents(
    connectors_to_search: List[str],
    writer: StreamWriter = None,
    state: State = None,
    top_k: int = 10
    top_k: int = 10,
    connector_service: ConnectorService = None
) -> List[Dict[str, Any]]:
    """
    Fetch relevant documents for research questions using the provided connectors.
@@ -162,7 +163,7 @@ async def fetch_relevant_documents(
        List of relevant documents
    """
    # Initialize services
    connector_service = ConnectorService(db_session)
    # connector_service = ConnectorService(db_session)
    # Only use streaming if both writer and state are provided
    streaming_service = state.streaming_service if state is not None else None
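The net effect of these two hunks is dependency injection: fetch_relevant_documents no longer constructs its own ConnectorService from db_session but expects the caller to pass one in. A minimal sketch of the new contract (parameter names are from the diff; the types of research_questions and the fail-fast guard are assumptions, since the commit only comments the old construction out and does not show a None check):

    async def fetch_relevant_documents(
        research_questions: List[str],
        user_id: str,
        connectors_to_search: List[str],
        writer: StreamWriter = None,
        state: State = None,
        top_k: int = 10,
        connector_service: ConnectorService = None,
    ) -> List[Dict[str, Any]]:
        # Assumption: fail fast when the caller forgets to inject the service;
        # the diff itself shows no explicit guard for the None default.
        if connector_service is None:
            raise ValueError("connector_service must be supplied by the caller")
        ...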
@@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
    elif configuration.num_sections == 6:
        TOP_K = 30
    relevant_documents = []
    async with async_session_maker() as db_session:
        try:
            # Create connector service inside the db_session scope
            connector_service = ConnectorService(db_session)
            relevant_documents = await fetch_relevant_documents(
                research_questions=all_questions,
                user_id=configuration.user_id,
@@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
                connectors_to_search=configuration.connectors_to_search,
                writer=writer,
                state=state,
                top_k=TOP_K
                top_k=TOP_K,
                connector_service=connector_service
            )
        except Exception as e:
            error_message = f"Error fetching relevant documents: {str(e)}"
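Creating the service inside the async with async_session_maker() block ties the ConnectorService's lifetime to the AsyncSession it borrows, so neither can outlive the other. A hedged sketch of that scoping rule as a reusable helper (connector_service_scope is hypothetical and not part of this commit; ConnectorService and async_session_maker are the names used in the diff):

    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def connector_service_scope(session_maker):
        # Hypothetical helper: the service borrows the session, so both
        # are opened and torn down together.
        async with session_maker() as db_session:
            yield ConnectorService(db_session)

    # Usage mirroring process_sections:
    # async with connector_service_scope(async_session_maker) as connector_service:
    #     docs = await fetch_relevant_documents(..., connector_service=connector_service)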


@@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A
        # Extract content and metadata
        content = doc.get("content", "")
        doc_info = doc.get("document", {})
        document_id = doc_info.get("id", f"{i+1}")  # Use document ID or index+1 as source_id
        document_id = doc_info.get("id")  # Use document ID
        # Format document according to the citation system prompt's expected format
        formatted_doc = f"""

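Dropping the f"{i+1}" fallback means doc_info.get("id") can now return None whenever a chunk carries no document id. A minimal guard, assuming such documents should simply be skipped (the commit does not show how write_sub_section actually handles the None case):

    content = doc.get("content", "")
    doc_info = doc.get("document", {})
    document_id = doc_info.get("id")  # may be None now that the index fallback is gone
    if document_id is None:
        continue  # assumption: skip documents that cannot be cited by id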

@@ -1,5 +1,6 @@
import json
from typing import List, Dict, Any, Optional, Tuple
import asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
@@ -13,6 +14,7 @@ class ConnectorService:
        self.session = session
        self.retriever = ChucksHybridSearchRetriever(session)
        self.source_id_counter = 1
        self.counter_lock = asyncio.Lock()  # Lock to protect the shared counter across concurrent async tasks
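source_id_counter is shared mutable state on the service, and with this commit several connector searches may run against one ConnectorService concurrently; the asyncio.Lock serializes access so coroutines never hand out duplicate source ids. A stripped-down, runnable sketch of the pattern (class and method names are illustrative, not from the codebase):

    import asyncio

    class CounterDemo:
        def __init__(self):
            self.source_id_counter = 1
            self.counter_lock = asyncio.Lock()

        async def next_id(self) -> int:
            # Without the lock, two coroutines interleaving here could read
            # the same counter value before either increments it.
            async with self.counter_lock:
                source_id = self.source_id_counter
                self.source_id_counter += 1
                return source_id

    async def main():
        demo = CounterDemo()
        ids = await asyncio.gather(*(demo.next_id() for _ in range(5)))
        print(sorted(ids))  # [1, 2, 3, 4, 5] with no duplicates

    asyncio.run(main())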
    async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple:
        """
@@ -29,8 +31,18 @@ class ConnectorService:
            document_type="CRAWLED_URL"
        )
        # Early return if no results
        if not crawled_urls_chunks:
            return {
                "id": 1,
                "name": "Crawled URLs",
                "type": "CRAWLED_URL",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(crawled_urls_chunks):
                # Fix for UI
                crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
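Every connector method below repeats this exact shape, varying only the connector id, display name, document type, and chunk variable. The common pattern, as a sketch with illustrative names (the real methods also build full source entries with titles, descriptions, and URLs inside the loop):

    async def search_connector_sketch(self, chunks, connector_id, name, doc_type):
        # Early return: empty result sets never acquire the lock or touch the counter.
        if not chunks:
            return {"id": connector_id, "name": name, "type": doc_type, "sources": []}, []
        sources_list = []
        # One pass under the lock keeps each connector's source ids contiguous
        # even when several searches run concurrently.
        async with self.counter_lock:
            for i, chunk in enumerate(chunks):
                chunks[i]["document"]["id"] = self.source_id_counter  # "Fix for UI"
                sources_list.append({"id": self.source_id_counter})   # real code adds title, url, etc.
                self.source_id_counter += 1
        return {"id": connector_id, "name": name, "type": doc_type, "sources": sources_list}, chunks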
@@ -74,8 +86,18 @@ class ConnectorService:
            document_type="FILE"
        )
        # Early return if no results
        if not files_chunks:
            return {
                "id": 2,
                "name": "Files",
                "type": "FILE",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(files_chunks):
                # Fix for UI
                files_chunks[i]['document']['id'] = self.source_id_counter
@@ -163,10 +185,20 @@ class ConnectorService:
        # Extract results from Tavily response
        tavily_results = response.get("results", [])
        # Early return if no results
        if not tavily_results:
            return {
                "id": 3,
                "name": "Tavily Search",
                "type": "TAVILY_API",
                "sources": [],
            }, []
        # Process each result and create sources directly without deduplication
        sources_list = []
        documents = []
        async with self.counter_lock:
            for i, result in enumerate(tavily_results):
                # Create a source entry
@@ -232,8 +264,18 @@ class ConnectorService:
            document_type="SLACK_CONNECTOR"
        )
        # Early return if no results
        if not slack_chunks:
            return {
                "id": 4,
                "name": "Slack",
                "type": "SLACK_CONNECTOR",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(slack_chunks):
                # Fix for UI
                slack_chunks[i]['document']['id'] = self.source_id_counter
@@ -302,8 +344,18 @@ class ConnectorService:
            document_type="NOTION_CONNECTOR"
        )
        # Early return if no results
        if not notion_chunks:
            return {
                "id": 5,
                "name": "Notion",
                "type": "NOTION_CONNECTOR",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(notion_chunks):
                # Fix for UI
                notion_chunks[i]['document']['id'] = self.source_id_counter
@@ -374,8 +426,18 @@ class ConnectorService:
            document_type="EXTENSION"
        )
        # Early return if no results
        if not extension_chunks:
            return {
                "id": 6,
                "name": "Extension",
                "type": "EXTENSION",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(extension_chunks):
                # Fix for UI
                extension_chunks[i]['document']['id'] = self.source_id_counter
@@ -464,8 +526,18 @@ class ConnectorService:
            document_type="YOUTUBE_VIDEO"
        )
        # Early return if no results
        if not youtube_chunks:
            return {
                "id": 7,
                "name": "YouTube Videos",
                "type": "YOUTUBE_VIDEO",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(youtube_chunks):
                # Fix for UI
                youtube_chunks[i]['document']['id'] = self.source_id_counter
@@ -530,8 +602,18 @@ class ConnectorService:
            document_type="GITHUB_CONNECTOR"
        )
        # Early return if no results
        if not github_chunks:
            return {
                "id": 8,
                "name": "GitHub",
                "type": "GITHUB_CONNECTOR",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(github_chunks):
                # Fix for UI - assign a unique ID for citation/source tracking
                github_chunks[i]['document']['id'] = self.source_id_counter
@@ -582,8 +664,18 @@ class ConnectorService:
            document_type="LINEAR_CONNECTOR"
        )
        # Early return if no results
        if not linear_chunks:
            return {
                "id": 9,
                "name": "Linear Issues",
                "type": "LINEAR_CONNECTOR",
                "sources": [],
            }, []
        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(linear_chunks):
                # Fix for UI
                linear_chunks[i]['document']['id'] = self.source_id_counter
@@ -697,6 +789,7 @@ class ConnectorService:
        sources_list = []
        documents = []
        async with self.counter_lock:
            for i, result in enumerate(linkup_results):
                # Only process results that have content
                if not hasattr(result, 'content') or not result.content:
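Unlike the database-backed connectors, Linkup results are response objects rather than dicts, so the loop guards on attribute presence before numbering a result. The same filter could be expressed up front (a sketch; linkup_results is the variable from the surrounding method):

    usable_results = [
        result for result in linkup_results
        if hasattr(result, "content") and result.content
    ]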