from typing import List, Dict, Optional
import asyncio

from sqlalchemy import func
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from tavily import TavilyClient
from linkup import LinkupClient

from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever
from app.db import SearchSourceConnector, SearchSourceConnectorType, Chunk, Document, SearchSpace
from app.agents.researcher.configuration import SearchMode


class ConnectorService:
    def __init__(self, session: AsyncSession, user_id: Optional[str] = None):
        self.session = session
        self.chunk_retriever = ChucksHybridSearchRetriever(session)
        self.document_retriever = DocumentHybridSearchRetriever(session)
        self.user_id = user_id
        self.source_id_counter = 100000  # High starting value to avoid collisions with existing IDs
        self.counter_lock = asyncio.Lock()  # Lock to protect the counter from concurrent coroutine access

    async def initialize_counter(self):
        """
        Initialize the source_id_counter based on the total number of chunks for the user.
        This ensures unique IDs across different sessions.
        """
        if self.user_id:
            try:
                # Count total chunks for documents belonging to this user
                result = await self.session.execute(
                    select(func.count(Chunk.id))
                    .join(Document)
                    .join(SearchSpace)
                    .filter(SearchSpace.user_id == self.user_id)
                )
                chunk_count = result.scalar() or 0
                self.source_id_counter = chunk_count + 1
                print(f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}")
            except Exception as e:
                print(f"Error initializing source_id_counter: {str(e)}")
                # Fallback to default value
                self.source_id_counter = 1

    async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                                  search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for crawled URLs and return both the source information and langchain documents

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            crawled_urls_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="CRAWLED_URL"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            crawled_urls_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="CRAWLED_URL"
            )
            # Transform document retriever results to match expected format
            crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks)

        # Early return if no results
        if not crawled_urls_chunks:
            return {
                "id": 1,
                "name": "Crawled URLs",
                "type": "CRAWLED_URL",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(crawled_urls_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a source entry
                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": document.get('title', 'Untitled Document'),
                    "description": metadata.get('og:description',
                                                metadata.get('ogDescription', chunk.get('content', '')[:100])),
                    "url": metadata.get('url', '')
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 1,
            "name": "Crawled URLs",
            "type": "CRAWLED_URL",
            "sources": sources_list,
        }

        return result_object, crawled_urls_chunks
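    # Illustrative sketch (not part of the runtime API): every search_* method in this
    # class returns a (sources_info, documents) pair shaped roughly as follows; the
    # concrete values here are hypothetical examples, not real data.
    #
    #   sources_info = {
    #       "id": 1,                       # connector-level id (fixed per connector type)
    #       "name": "Crawled URLs",
    #       "type": "CRAWLED_URL",
    #       "sources": [
    #           {"id": 42, "title": "Example page", "description": "First 100 chars...",
    #            "url": "https://example.com"},
    #       ],
    #   }
    #   documents = [
    #       {"content": "chunk text", "score": 0.87,
    #        "document": {"id": 42, "title": "Example page", "metadata": {"url": "https://example.com"}}},
    #   ]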
    async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                           search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for files and return both the source information and langchain documents

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            files_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="FILE"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            files_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="FILE"
            )
            # Transform document retriever results to match expected format
            files_chunks = self._transform_document_results(files_chunks)

        # Early return if no results
        if not files_chunks:
            return {
                "id": 2,
                "name": "Files",
                "type": "FILE",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(files_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a source entry
                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": document.get('title', 'Untitled Document'),
                    "description": metadata.get('og:description',
                                                metadata.get('ogDescription', chunk.get('content', '')[:100])),
                    "url": metadata.get('url', '')
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 2,
            "name": "Files",
            "type": "FILE",
            "sources": sources_list,
        }

        return result_object, files_chunks
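    # The helper below normalizes hits from document_retriever.hybrid_search() into the
    # chunk-shaped dicts the source-building code above expects. A minimal sketch of the
    # mapping (field names are taken from the code; the values are made up for illustration):
    #
    #   {"document_id": 7, "title": "Q3 report", "document_type": "FILE",
    #    "metadata": {"url": "https://example.com/q3"}, "chunks_content": "text...", "score": 0.42}
    #       -> {"document": {"id": 7, "title": "Q3 report", "document_type": "FILE",
    #                        "metadata": {"url": "https://example.com/q3"}},
    #           "content": "text...", "score": 0.42}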
    def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]:
        """
        Transform results from document_retriever.hybrid_search() to match the format
        expected by the processing code.

        Args:
            document_results: Results from document_retriever.hybrid_search()

        Returns:
            List of transformed results in the format expected by the processing code
        """
        transformed_results = []
        for doc in document_results:
            transformed_results.append({
                'document': {
                    'id': doc.get('document_id'),
                    'title': doc.get('title', 'Untitled Document'),
                    'document_type': doc.get('document_type'),
                    'metadata': doc.get('metadata', {}),
                },
                'content': doc.get('chunks_content', doc.get('content', '')),
                'score': doc.get('score', 0.0)
            })
        return transformed_results

    async def get_connector_by_type(self, user_id: str,
                                    connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]:
        """
        Get a connector by type for a specific user

        Args:
            user_id: The user's ID
            connector_type: The connector type to retrieve

        Returns:
            Optional[SearchSourceConnector]: The connector if found, None otherwise
        """
        result = await self.session.execute(
            select(SearchSourceConnector)
            .filter(
                SearchSourceConnector.user_id == user_id,
                SearchSourceConnector.connector_type == connector_type
            )
        )
        return result.scalars().first()

    async def search_tavily(self, user_query: str, user_id: str, top_k: int = 20) -> tuple:
        """
        Search using Tavily API and return both the source information and documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, documents)
        """
        # Get Tavily connector configuration
        tavily_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.TAVILY_API)

        if not tavily_connector:
            # Return empty results if no Tavily connector is configured
            return {
                "id": 3,
                "name": "Tavily Search",
                "type": "TAVILY_API",
                "sources": [],
            }, []

        # Initialize Tavily client with API key from connector config
        tavily_api_key = tavily_connector.config.get("TAVILY_API_KEY")
        tavily_client = TavilyClient(api_key=tavily_api_key)

        # Perform search with Tavily
        try:
            response = tavily_client.search(
                query=user_query,
                max_results=top_k,
                search_depth="advanced"  # Use advanced search for better results
            )

            # Extract results from Tavily response
            tavily_results = response.get("results", [])

            # Early return if no results
            if not tavily_results:
                return {
                    "id": 3,
                    "name": "Tavily Search",
                    "type": "TAVILY_API",
                    "sources": [],
                }, []

            # Process each result and create sources directly without deduplication
            sources_list = []
            documents = []
            async with self.counter_lock:
                for i, result in enumerate(tavily_results):
                    # Create a source entry
                    source = {
                        "id": self.source_id_counter,
                        "title": result.get("title", "Tavily Result"),
                        "description": result.get("content", "")[:100],
                        "url": result.get("url", "")
                    }
                    sources_list.append(source)

                    # Create a document entry
                    document = {
                        "chunk_id": f"tavily_chunk_{i}",
                        "content": result.get("content", ""),
                        "score": result.get("score", 0.0),
                        "document": {
                            "id": self.source_id_counter,
                            "title": result.get("title", "Tavily Result"),
                            "document_type": "TAVILY_API",
                            "metadata": {
                                "url": result.get("url", ""),
                                "published_date": result.get("published_date", ""),
                                "source": "TAVILY_API"
                            }
                        }
                    }
                    documents.append(document)
                    self.source_id_counter += 1

            # Create result object
            result_object = {
                "id": 3,
                "name": "Tavily Search",
                "type": "TAVILY_API",
                "sources": sources_list,
            }

            return result_object, documents

        except Exception as e:
            # Log the error and return empty results
            print(f"Error searching with Tavily: {str(e)}")
            return {
                "id": 3,
                "name": "Tavily Search",
                "type": "TAVILY_API",
                "sources": [],
            }, []
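    # Note on connector configuration (illustrative, inferred only from the keys this
    # class reads above and in search_linkup below): the web-search connectors are
    # expected to carry their API keys in the connector's `config` mapping, e.g.
    #
    #   TAVILY_API  -> {"TAVILY_API_KEY": "<key>"}
    #   LINKUP_API  -> {"LINKUP_API_KEY": "<key>"}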
    async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                           search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for Slack messages and return both the source information and langchain documents

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            slack_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="SLACK_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            slack_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="SLACK_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            slack_chunks = self._transform_document_results(slack_chunks)

        # Early return if no results
        if not slack_chunks:
            return {
                "id": 4,
                "name": "Slack",
                "type": "SLACK_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(slack_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a mapped source entry with Slack-specific metadata
                channel_name = metadata.get('channel_name', 'Unknown Channel')
                channel_id = metadata.get('channel_id', '')
                message_date = metadata.get('start_date', '')

                # Create a more descriptive title for Slack messages
                title = f"Slack: {channel_name}"
                if message_date:
                    title += f" ({message_date})"

                # Create a more descriptive description for Slack messages
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."

                # For URL, we can use a placeholder or construct a URL to the Slack channel if available
                url = ""
                if channel_id:
                    url = f"https://slack.com/app_redirect?channel={channel_id}"

                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 4,
            "name": "Slack",
            "type": "SLACK_CONNECTOR",
            "sources": sources_list,
        }

        return result_object, slack_chunks

    async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                            search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for Notion pages and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            notion_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="NOTION_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            notion_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="NOTION_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            notion_chunks = self._transform_document_results(notion_chunks)

        # Early return if no results
        if not notion_chunks:
            return {
                "id": 5,
                "name": "Notion",
                "type": "NOTION_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(notion_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a mapped source entry with Notion-specific metadata
                page_title = metadata.get('page_title', 'Untitled Page')
                page_id = metadata.get('page_id', '')
                indexed_at = metadata.get('indexed_at', '')

                # Create a more descriptive title for Notion pages
                title = f"Notion: {page_title}"
                if indexed_at:
                    title += f" (indexed: {indexed_at})"

                # Create a more descriptive description for Notion pages
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."

                # For URL, we can use a placeholder or construct a URL to the Notion page if available
                url = ""
                if page_id:
                    # Notion page URLs follow this format
                    url = f"https://notion.so/{page_id.replace('-', '')}"

                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 5,
            "name": "Notion",
            "type": "NOTION_CONNECTOR",
            "sources": sources_list,
        }

        return result_object, notion_chunks

    async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                               search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for extension data and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            extension_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="EXTENSION"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            extension_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="EXTENSION"
            )
            # Transform document retriever results to match expected format
            extension_chunks = self._transform_document_results(extension_chunks)

        # Early return if no results
        if not extension_chunks:
            return {
                "id": 6,
                "name": "Extension",
                "type": "EXTENSION",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(extension_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Extract extension-specific metadata
                webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
                webpage_url = metadata.get('VisitedWebPageURL', '')
                visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
                visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')
                browsing_session_id = metadata.get('BrowsingSessionId', '')

                # Create a more descriptive title for extension data
                title = webpage_title
                if visit_date:
                    # Format the date for display (simplified)
                    try:
                        # Just extract the date part for display
                        formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
                        title += f" (visited: {formatted_date})"
                    except Exception:
                        # Fallback if date parsing fails
                        title += f" (visited: {visit_date})"

                # Create a more descriptive description for extension data
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."
                # Add visit duration if available
                if visit_duration:
                    try:
                        duration_seconds = int(visit_duration) / 1000
                        if duration_seconds < 60:
                            duration_text = f"{duration_seconds:.1f} seconds"
                        else:
                            duration_text = f"{duration_seconds/60:.1f} minutes"
                        if description:
                            description += f" | Duration: {duration_text}"
                    except Exception:
                        # Fallback if duration parsing fails
                        pass

                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": webpage_url
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 6,
            "name": "Extension",
            "type": "EXTENSION",
            "sources": sources_list,
        }

        return result_object, extension_chunks

    async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                             search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for YouTube videos and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            youtube_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="YOUTUBE_VIDEO"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            youtube_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="YOUTUBE_VIDEO"
            )
            # Transform document retriever results to match expected format
            youtube_chunks = self._transform_document_results(youtube_chunks)

        # Early return if no results
        if not youtube_chunks:
            return {
                "id": 7,
                "name": "YouTube Videos",
                "type": "YOUTUBE_VIDEO",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(youtube_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Extract YouTube-specific metadata
                video_title = metadata.get('video_title', 'Untitled Video')
                video_id = metadata.get('video_id', '')
                channel_name = metadata.get('channel_name', '')
                # published_date = metadata.get('published_date', '')

                # Create a more descriptive title for YouTube videos
                title = video_title
                if channel_name:
                    title += f" - {channel_name}"

                # Create a more descriptive description for YouTube videos
                description = metadata.get('description', chunk.get('content', '')[:100])
                if len(description) == 100:
                    description += "..."
                # For URL, construct a URL to the YouTube video
                url = f"https://www.youtube.com/watch?v={video_id}" if video_id else ""

                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                    "video_id": video_id,  # Additional field for YouTube videos
                    "channel_name": channel_name  # Additional field for YouTube videos
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 7,  # Assign a unique ID for the YouTube connector
            "name": "YouTube Videos",
            "type": "YOUTUBE_VIDEO",
            "sources": sources_list,
        }

        return result_object, youtube_chunks

    async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20,
                            search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for GitHub documents and return both the source information and langchain documents

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            github_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="GITHUB_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            github_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="GITHUB_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            github_chunks = self._transform_document_results(github_chunks)

        # Early return if no results
        if not github_chunks:
            return {
                "id": 8,
                "name": "GitHub",
                "type": "GITHUB_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(github_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a source entry
                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": document.get('title', 'GitHub Document'),  # Use specific title if available
                    "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
                    "url": metadata.get('url', '')  # Use URL if available in metadata
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 8,
            "name": "GitHub",
            "type": "GITHUB_CONNECTOR",
            "sources": sources_list,
        }

        return result_object, github_chunks

    async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                            search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for Linear issues and comments and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            linear_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="LINEAR_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            linear_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="LINEAR_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            linear_chunks = self._transform_document_results(linear_chunks)

        # Early return if no results
        if not linear_chunks:
            return {
                "id": 9,
                "name": "Linear Issues",
                "type": "LINEAR_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for _i, chunk in enumerate(linear_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Extract Linear-specific metadata
                issue_identifier = metadata.get('issue_identifier', '')
                issue_title = metadata.get('issue_title', 'Untitled Issue')
                issue_state = metadata.get('state', '')
                comment_count = metadata.get('comment_count', 0)

                # Create a more descriptive title for Linear issues
                title = f"Linear: {issue_identifier} - {issue_title}"
                if issue_state:
                    title += f" ({issue_state})"

                # Create a more descriptive description for Linear issues
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."

                # Add comment count info to description
                if comment_count:
                    if description:
                        description += f" | Comments: {comment_count}"
                    else:
                        description = f"Comments: {comment_count}"

                # For URL, we could construct a URL to the Linear issue if we have the workspace info
                # For now, use a generic placeholder
                url = ""
                if issue_identifier:
                    # This is a generic format, may need to be adjusted based on actual Linear workspace
                    url = f"https://linear.app/issue/{issue_identifier}"

                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                    "issue_identifier": issue_identifier,
                    "state": issue_state,
                    "comment_count": comment_count
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 9,  # Assign a unique ID for the Linear connector
            "name": "Linear Issues",
            "type": "LINEAR_CONNECTOR",
            "sources": sources_list,
        }

        return result_object, linear_chunks

    async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple:
        """
        Search using Linkup API and return both the source information and documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            mode: Search depth mode, can be "standard" or "deep"

        Returns:
            tuple: (sources_info, documents)
        """
        # Get Linkup connector configuration
        linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API)

        if not linkup_connector:
            # Return empty results if no Linkup connector is configured
            return {
                "id": 10,
                "name": "Linkup Search",
                "type": "LINKUP_API",
                "sources": [],
            }, []

        # Initialize Linkup client with API key from connector config
        linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY")
        linkup_client = LinkupClient(api_key=linkup_api_key)

        # Perform search with Linkup
        try:
            response = linkup_client.search(
                query=user_query,
                depth=mode,  # Use the provided mode ("standard" or "deep")
                output_type="searchResults",  # Default to search results
            )

            # Extract results from Linkup response - access as attribute instead of using .get()
            linkup_results = response.results if hasattr(response, 'results') else []

            # Only proceed if we have results
            if not linkup_results:
                return {
                    "id": 10,
                    "name": "Linkup Search",
                    "type": "LINKUP_API",
                    "sources": [],
                }, []

            # Process each result and create sources directly without deduplication
            sources_list = []
            documents = []
            async with self.counter_lock:
                for i, result in enumerate(linkup_results):
                    # Only process results that have content
                    if not hasattr(result, 'content') or not result.content:
                        continue

                    # Create a source entry
                    source = {
                        "id": self.source_id_counter,
                        "title": result.name if hasattr(result, 'name') else "Linkup Result",
                        "description": result.content[:100] if hasattr(result, 'content') else "",
                        "url": result.url if hasattr(result, 'url') else ""
                    }
                    sources_list.append(source)

                    # Create a document entry
                    document = {
                        "chunk_id": f"linkup_chunk_{i}",
                        "content": result.content if hasattr(result, 'content') else "",
                        "score": 1.0,  # Default score since not provided by Linkup
                        "document": {
                            "id": self.source_id_counter,
                            "title": result.name if hasattr(result, 'name') else "Linkup Result",
                            "document_type": "LINKUP_API",
                            "metadata": {
                                "url": result.url if hasattr(result, 'url') else "",
                                "type": result.type if hasattr(result, 'type') else "",
                                "source": "LINKUP_API"
                            }
                        }
                    }
                    documents.append(document)
                    self.source_id_counter += 1

            # Create result object
            result_object = {
                "id": 10,
                "name": "Linkup Search",
                "type": "LINKUP_API",
                "sources": sources_list,
            }

            return result_object, documents

        except Exception as e:
            # Log the error and return empty results
            print(f"Error searching with Linkup: {str(e)}")
            return {
                "id": 10,
                "name": "Linkup Search",
                "type": "LINKUP_API",
                "sources": [],
            }, []

    async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20,
                             search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for Discord messages and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            discord_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="DISCORD_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            discord_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="DISCORD_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            discord_chunks = self._transform_document_results(discord_chunks)

        # Early return if no results
        if not discord_chunks:
            return {
                "id": 11,
                "name": "Discord",
                "type": "DISCORD_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
            for i, chunk in enumerate(discord_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Create a mapped source entry with Discord-specific metadata
                channel_name = metadata.get('channel_name', 'Unknown Channel')
                channel_id = metadata.get('channel_id', '')
                message_date = metadata.get('start_date', '')

                # Create a more descriptive title for Discord messages
                title = f"Discord: {channel_name}"
                if message_date:
                    title += f" ({message_date})"

                # Create a more descriptive description for Discord messages
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."
url = "" guild_id = metadata.get('guild_id', '') if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: # Fallback for DM channels or when guild_id is not available url = f"https://discord.com/channels/@me/{channel_id}" source = { "id": document.get('id', self.source_id_counter), "title": title, "description": description, "url": url, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 11, "name": "Discord", "type": "DISCORD_CONNECTOR", "sources": sources_list, } return result_object, discord_chunks