feat: discord knowledge retrieval

2025-09-01 10:09:08 +00:00 · 2025-06-02 18:43:32 +07:00 · 2025-06-02 18:43:32 +07:00 · 1d67a87b82
commit 1d67a87b82
parent 158976e802
5 changed files with 124 additions and 1 deletions
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@ -400,6 +400,23 @@ async def fetch_relevant_documents(
                    if streaming_service and writer:
                        streaming_service.only_update_terminal(f"🔗 Found {len(linkup_chunks)} Linkup results related to your query")
                        writer({"yeild_value": streaming_service._format_annotations()})
+                        
+                elif connector == "DISCORD_CONNECTOR":
+                    source_object, discord_chunks = await connector_service.search_discord(
+                        user_query=reformulated_query,
+                        user_id=user_id,
+                        search_space_id=search_space_id,
+                        top_k=top_k,
+                        search_mode=search_mode
+                    )
+                    # Add to sources and raw documents
+                    if source_object:
+                        all_sources.append(source_object)
+                    all_raw_documents.extend(discord_chunks)
+                    # Stream found document count
+                    if streaming_service and writer:
+                        streaming_service.only_update_terminal(f"🗨️ Found {len(discord_chunks)} Discord messages related to your query")
+                        writer({"yeild_value": streaming_service._format_annotations()})
                    

            except Exception as e:
--- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
+++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
@ -15,6 +15,7 @@ You are SurfSense, an advanced AI research assistant that synthesizes informatio
 - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
 - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
 - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
+- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions)
 - TAVILY_API: "Tavily search API results" (personalized search results)
 - LINKUP_API: "Linkup search API results" (personalized search results)
 </knowledge_sources>
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -7,7 +7,7 @@ PUT /search-source-connectors/{connector_id} - Update a specific connector
 DELETE /search-source-connectors/{connector_id} - Delete a specific connector
 POST /search-source-connectors/{connector_id}/index - Index content from a connector to a search space

-Note: Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, NOTION_CONNECTOR, GITHUB_CONNECTOR, LINEAR_CONNECTOR).
+Note: Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, NOTION_CONNECTOR, GITHUB_CONNECTOR, LINEAR_CONNECTOR, DISCORD_CONNECTOR).
 """
 from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks, Body
 from sqlalchemy.ext.asyncio import AsyncSession
@ -282,6 +282,7 @@ async def index_connector_content(
    - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages
    - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories
    - LINEAR_CONNECTOR: Indexes issues and comments from Linear
+    - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
    
    Args:
        connector_id: ID of the connector to use
--- a/surfsense_backend/app/schemas/search_source_connector.py
+++ b/surfsense_backend/app/schemas/search_source_connector.py
@ -81,6 +81,7 @@ class SearchSourceConnectorBase(BaseModel):
            repo_full_names = config.get("repo_full_names")
            if not isinstance(repo_full_names, list) or not repo_full_names:
                raise ValueError("repo_full_names must be a non-empty list of strings")
+            
        elif connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR:
            # For LINEAR_CONNECTOR, only allow LINEAR_API_KEY
            allowed_keys = ["LINEAR_API_KEY"]
@ -90,6 +91,16 @@ class SearchSourceConnectorBase(BaseModel):
            # Ensure the token is not empty
            if not config.get("LINEAR_API_KEY"):
                raise ValueError("LINEAR_API_KEY cannot be empty")
+        
+        elif connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR:
+            # For DISCORD_CONNECTOR, only allow DISCORD_BOT_TOKEN
+            allowed_keys = ["DISCORD_BOT_TOKEN"]
+            if set(config.keys()) != set(allowed_keys):
+                raise ValueError(f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
+
+            # Ensure the bot token is not empty
+            if not config.get("DISCORD_BOT_TOKEN"):
+                raise ValueError("DISCORD_BOT_TOKEN cannot be empty")

        return config

--- a/surfsense_backend/app/utils/connector_service.py
+++ b/surfsense_backend/app/utils/connector_service.py
@ -959,3 +959,96 @@ class ConnectorService:
                "type": "LINKUP_API",
                "sources": [],
            }, []
+    
+    async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
+        """
+        Search for Discord messages and return both the source information and langchain documents
+
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: SearchMode.CHUNKS to query the chunk retriever, or
+                SearchMode.DOCUMENTS to query the document retriever
+
+        Returns:
+            tuple: (sources_info, langchain_documents). sources_info is a dict with
+            "id", "name", "type" and "sources" keys; when nothing matches, "sources"
+            is empty and the second element is an empty list.
+
+        Raises:
+            ValueError: If search_mode is neither SearchMode.CHUNKS nor SearchMode.DOCUMENTS
+        """
+        if search_mode == SearchMode.CHUNKS:
+            discord_chunks = await self.chunk_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="DISCORD_CONNECTOR"
+            )
+        elif search_mode == SearchMode.DOCUMENTS:
+            discord_chunks = await self.document_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="DISCORD_CONNECTOR"
+            )
+            # Transform document retriever results to match expected format
+            discord_chunks = self._transform_document_results(discord_chunks)
+        else:
+            # Without this branch discord_chunks would be unbound below and the
+            # call would fail with an opaque UnboundLocalError.
+            raise ValueError(f"Unsupported search mode for Discord search: {search_mode!r}")
+
+        # Early return if no results
+        if not discord_chunks:
+            return {
+                "id": 11,
+                "name": "Discord",
+                "type": "DISCORD_CONNECTOR",
+                "sources": [],
+            }, []
+
+        # Process each chunk and create sources directly without deduplication
+        sources_list = []
+        # Hold the lock for the whole loop so the ids handed out here are not
+        # interleaved with other users of source_id_counter.
+        async with self.counter_lock:
+            for chunk in discord_chunks:
+                # Fix for UI: stamp the document with the same id as its source entry.
+                # setdefault keeps this consistent with the tolerant .get() lookups
+                # below instead of raising KeyError when 'document' is missing.
+                document = chunk.setdefault('document', {})
+                document['id'] = self.source_id_counter
+                metadata = document.get('metadata', {})
+
+                # Create a mapped source entry with Discord-specific metadata
+                channel_name = metadata.get('channel_name', 'Unknown Channel')
+                channel_id = metadata.get('channel_id', '')
+                message_date = metadata.get('start_date', '')
+
+                # Create a more descriptive title for Discord messages
+                title = f"Discord: {channel_name}"
+                if message_date:
+                    title += f" ({message_date})"
+
+                # Short preview of the content; the ellipsis is added only when
+                # something was actually cut off (content longer than 100 chars).
+                content = chunk.get('content') or ''
+                description = content[:100]
+                if len(content) > 100:
+                    description += "..."
+
+                # Link to the channel when its id is known, otherwise leave the URL empty.
+                # NOTE(review): "@me" is Discord's path segment for direct messages;
+                # server channels are normally addressed by guild id — confirm whether
+                # the indexed metadata carries one that should be used here.
+                url = ""
+                if channel_id:
+                    url = f"https://discord.com/channels/@me/{channel_id}"
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                }
+
+                self.source_id_counter += 1
+                sources_list.append(source)
+
+        # Create result object
+        result_object = {
+            "id": 11,
+            "name": "Discord",
+            "type": "DISCORD_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, discord_chunks
+
+