From 1d67a87b82d11f30fa6f54d2229577bdb98542b3 Mon Sep 17 00:00:00 2001 From: Muhamad Aji Wibisono Date: Mon, 2 Jun 2025 18:43:32 +0700 Subject: [PATCH] feat: discord knowledge retrieval --- .../app/agents/researcher/nodes.py | 17 ++++ .../researcher/sub_section_writer/prompts.py | 1 + .../routes/search_source_connectors_routes.py | 3 +- .../app/schemas/search_source_connector.py | 11 +++ .../app/utils/connector_service.py | 93 +++++++++++++++++++ 5 files changed, 124 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index fcec440..d3c5ab6 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -400,6 +400,23 @@ async def fetch_relevant_documents( if streaming_service and writer: streaming_service.only_update_terminal(f"🔗 Found {len(linkup_chunks)} Linkup results related to your query") writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "DISCORD_CONNECTOR": + source_object, discord_chunks = await connector_service.search_discord( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode + ) + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(discord_chunks) + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"🗨️ Found {len(discord_chunks)} Discord messages related to your query") + writer({"yeild_value": streaming_service._format_annotations()}) except Exception as e: diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py index e87c9e8..186ea7d 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py +++ 
b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py @@ -15,6 +15,7 @@ You are SurfSense, an advanced AI research assistant that synthesizes informatio - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) +- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions) - TAVILY_API: "Tavily search API results" (personalized search results) - LINKUP_API: "Linkup search API results" (personalized search results) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 7df690e..95c9e08 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -7,7 +7,7 @@ PUT /search-source-connectors/{connector_id} - Update a specific connector DELETE /search-source-connectors/{connector_id} - Delete a specific connector POST /search-source-connectors/{connector_id}/index - Index content from a connector to a search space -Note: Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, NOTION_CONNECTOR, GITHUB_CONNECTOR, LINEAR_CONNECTOR). +Note: Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, NOTION_CONNECTOR, GITHUB_CONNECTOR, LINEAR_CONNECTOR, DISCORD_CONNECTOR). 
""" from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks, Body from sqlalchemy.ext.asyncio import AsyncSession @@ -282,6 +282,7 @@ async def index_connector_content( - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories - LINEAR_CONNECTOR: Indexes issues and comments from Linear + - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels Args: connector_id: ID of the connector to use diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index b136757..1225d54 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -81,6 +81,7 @@ class SearchSourceConnectorBase(BaseModel): repo_full_names = config.get("repo_full_names") if not isinstance(repo_full_names, list) or not repo_full_names: raise ValueError("repo_full_names must be a non-empty list of strings") + elif connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR: # For LINEAR_CONNECTOR, only allow LINEAR_API_KEY allowed_keys = ["LINEAR_API_KEY"] @@ -90,6 +91,16 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the token is not empty if not config.get("LINEAR_API_KEY"): raise ValueError("LINEAR_API_KEY cannot be empty") + + elif connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR: + # For DISCORD_CONNECTOR, only allow DISCORD_BOT_TOKEN + allowed_keys = ["DISCORD_BOT_TOKEN"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the bot token is not empty + if not config.get("DISCORD_BOT_TOKEN"): + raise ValueError("DISCORD_BOT_TOKEN cannot be empty") return config diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 49c3b08..c18b288 
100644
--- a/surfsense_backend/app/utils/connector_service.py
+++ b/surfsense_backend/app/utils/connector_service.py
@@ -959,3 +959,104 @@ class ConnectorService:
                 "type": "LINKUP_API",
                 "sources": [],
             }, []
+
+    async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
+        """
+        Search for Discord messages and return both the source information and langchain documents
+
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: Retrieve matching chunks (SearchMode.CHUNKS) or whole documents (SearchMode.DOCUMENTS)
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+
+        Raises:
+            ValueError: If search_mode is neither SearchMode.CHUNKS nor SearchMode.DOCUMENTS
+        """
+        if search_mode == SearchMode.CHUNKS:
+            discord_chunks = await self.chunk_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="DISCORD_CONNECTOR"
+            )
+        elif search_mode == SearchMode.DOCUMENTS:
+            discord_chunks = await self.document_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="DISCORD_CONNECTOR"
+            )
+            # Transform document retriever results to match expected format
+            discord_chunks = self._transform_document_results(discord_chunks)
+        else:
+            # Fail with a clear error instead of an UnboundLocalError further down
+            raise ValueError(f"Unsupported search mode: {search_mode!r}")
+
+        # Early return if no results
+        if not discord_chunks:
+            return {
+                "id": 11,
+                "name": "Discord",
+                "type": "DISCORD_CONNECTOR",
+                "sources": [],
+            }, []
+
+        # Process each chunk and create sources directly without deduplication
+        sources_list = []
+        async with self.counter_lock:
+            for chunk in discord_chunks:
+                # Fix for UI: stamp the document with the id given to its source entry.
+                # setdefault avoids a KeyError when a chunk carries no 'document'.
+                document = chunk.setdefault('document', {})
+                document['id'] = self.source_id_counter
+                metadata = document.get('metadata', {})
+
+                # Discord-specific metadata used to build the source entry
+                channel_name = metadata.get('channel_name', 'Unknown Channel')
+                channel_id = metadata.get('channel_id', '')
+                message_date = metadata.get('start_date', '')
+
+                # Create a more descriptive title for Discord messages
+                title = f"Discord: {channel_name}"
+                if message_date:
+                    title += f" ({message_date})"
+
+                # Short preview; add an ellipsis only when content was actually cut
+                content = chunk.get('content', '')
+                description = content[:100]
+                if len(content) > 100:
+                    description += "..."
+
+                url = ""
+                if channel_id:
+                    # "@me" only addresses DMs; server channels need the guild id.
+                    # NOTE(review): assumes the indexer may store 'guild_id' - confirm.
+                    guild_id = metadata.get('guild_id') or "@me"
+                    url = f"https://discord.com/channels/{guild_id}/{channel_id}"
+
+                sources_list.append({
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                })
+                self.source_id_counter += 1
+
+        # Create result object
+        result_object = {
+            "id": 11,
+            "name": "Discord",
+            "type": "DISCORD_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, discord_chunks
+
+