import asyncio from typing import Dict, List, Optional from app.agents.researcher.configuration import SearchMode from app.db import ( Chunk, Document, SearchSourceConnector, SearchSourceConnectorType, SearchSpace, ) from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever from linkup import LinkupClient from sqlalchemy import func from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from tavily import TavilyClient class ConnectorService: def __init__(self, session: AsyncSession, user_id: str = None): self.session = session self.chunk_retriever = ChucksHybridSearchRetriever(session) self.document_retriever = DocumentHybridSearchRetriever(session) self.user_id = user_id self.source_id_counter = ( 100000 # High starting value to avoid collisions with existing IDs ) self.counter_lock = ( asyncio.Lock() ) # Lock to protect counter in multithreaded environments async def initialize_counter(self): """ Initialize the source_id_counter based on the total number of chunks for the user. This ensures unique IDs across different sessions. """ if self.user_id: try: # Count total chunks for documents belonging to this user result = await self.session.execute( select(func.count(Chunk.id)) .join(Document) .join(SearchSpace) .filter(SearchSpace.user_id == self.user_id) ) chunk_count = result.scalar() or 0 self.source_id_counter = chunk_count + 1 print( f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}" ) except Exception as e: print(f"Error initializing source_id_counter: {str(e)}") # Fallback to default value self.source_id_counter = 1 async def search_crawled_urls( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: crawled_urls_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="CRAWLED_URL", ) elif search_mode == SearchMode.DOCUMENTS: crawled_urls_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="CRAWLED_URL", ) # Transform document retriever results to match expected format crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) # Early return if no results if not crawled_urls_chunks: return { "id": 1, "name": "Crawled URLs", "type": "CRAWLED_URL", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(crawled_urls_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a source entry source = { "id": document.get("id", self.source_id_counter), "title": document.get("title", "Untitled Document"), "description": metadata.get( "og:description", metadata.get("ogDescription", chunk.get("content", "")[:100]), ), "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 1, "name": "Crawled URLs", "type": "CRAWLED_URL", "sources": sources_list, } return result_object, crawled_urls_chunks async def search_files( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for files and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: files_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="FILE", ) elif search_mode == SearchMode.DOCUMENTS: files_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="FILE", ) # Transform document retriever results to match expected format files_chunks = self._transform_document_results(files_chunks) # Early return if no results if not files_chunks: return { "id": 2, "name": "Files", "type": "FILE", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(files_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a source entry source = { "id": document.get("id", self.source_id_counter), "title": document.get("title", "Untitled Document"), "description": metadata.get( "og:description", metadata.get("ogDescription", chunk.get("content", "")[:100]), ), "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 2, "name": "Files", "type": "FILE", "sources": sources_list, } return result_object, files_chunks def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]: """ Transform results from document_retriever.hybrid_search() to match the format expected by the processing code. Args: document_results: Results from document_retriever.hybrid_search() Returns: List of transformed results in the format expected by the processing code """ transformed_results = [] for doc in document_results: transformed_results.append( { "document": { "id": doc.get("document_id"), "title": doc.get("title", "Untitled Document"), "document_type": doc.get("document_type"), "metadata": doc.get("metadata", {}), }, "content": doc.get("chunks_content", doc.get("content", "")), "score": doc.get("score", 0.0), } ) return transformed_results async def get_connector_by_type( self, user_id: str, connector_type: SearchSourceConnectorType ) -> Optional[SearchSourceConnector]: """ Get a connector by type for a specific user Args: user_id: The user's ID connector_type: The connector type to retrieve Returns: Optional[SearchSourceConnector]: The connector if found, None otherwise """ result = await self.session.execute( select(SearchSourceConnector).filter( SearchSourceConnector.user_id == user_id, SearchSourceConnector.connector_type == connector_type, ) ) return result.scalars().first() async def search_tavily( self, user_query: str, user_id: str, top_k: int = 20 ) -> tuple: """ Search using Tavily API and return both the source information and documents Args: user_query: The user's query user_id: The user's ID top_k: Maximum number of results to return Returns: tuple: (sources_info, documents) """ # Get Tavily connector configuration tavily_connector = await self.get_connector_by_type( user_id, SearchSourceConnectorType.TAVILY_API ) if not tavily_connector: # Return empty results if no Tavily connector is configured return { "id": 3, "name": "Tavily Search", "type": "TAVILY_API", "sources": [], }, [] # Initialize Tavily client with API key from connector config tavily_api_key = tavily_connector.config.get("TAVILY_API_KEY") tavily_client = TavilyClient(api_key=tavily_api_key) # Perform search with Tavily try: response = tavily_client.search( query=user_query, max_results=top_k, search_depth="advanced", # Use advanced search for better results ) # Extract results from Tavily response tavily_results = response.get("results", []) # Early return if no results if not tavily_results: return { "id": 3, "name": "Tavily Search", "type": "TAVILY_API", "sources": [], }, [] # Process each result and create sources directly without deduplication sources_list = [] documents = [] async with self.counter_lock: for i, result in enumerate(tavily_results): # Create a source entry source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), "description": result.get("content", "")[:100], "url": result.get("url", ""), } sources_list.append(source) # Create a document entry document = { "chunk_id": f"tavily_chunk_{i}", "content": result.get("content", ""), "score": result.get("score", 0.0), "document": { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), "document_type": "TAVILY_API", "metadata": { "url": result.get("url", ""), "published_date": result.get("published_date", ""), "source": "TAVILY_API", }, }, } documents.append(document) self.source_id_counter += 1 # Create result object result_object = { "id": 3, "name": "Tavily Search", "type": "TAVILY_API", "sources": sources_list, } return result_object, documents except Exception as e: # Log the error and return empty results print(f"Error searching with Tavily: {str(e)}") return { "id": 3, "name": "Tavily Search", "type": "TAVILY_API", "sources": [], }, [] async def search_slack( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for slack and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: slack_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="SLACK_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: slack_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="SLACK_CONNECTOR", ) # Transform document retriever results to match expected format slack_chunks = self._transform_document_results(slack_chunks) # Early return if no results if not slack_chunks: return { "id": 4, "name": "Slack", "type": "SLACK_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(slack_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a mapped source entry with Slack-specific metadata channel_name = metadata.get("channel_name", "Unknown Channel") channel_id = metadata.get("channel_id", "") message_date = metadata.get("start_date", "") # Create a more descriptive title for Slack messages title = f"Slack: {channel_name}" if message_date: title += f" ({message_date})" # Create a more descriptive description for Slack messages description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." # For URL, we can use a placeholder or construct a URL to the Slack channel if available url = "" if channel_id: url = f"https://slack.com/app_redirect?channel={channel_id}" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 4, "name": "Slack", "type": "SLACK_CONNECTOR", "sources": sources_list, } return result_object, slack_chunks async def search_notion( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for Notion pages and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: notion_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="NOTION_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: notion_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="NOTION_CONNECTOR", ) # Transform document retriever results to match expected format notion_chunks = self._transform_document_results(notion_chunks) # Early return if no results if not notion_chunks: return { "id": 5, "name": "Notion", "type": "NOTION_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(notion_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a mapped source entry with Notion-specific metadata page_title = metadata.get("page_title", "Untitled Page") page_id = metadata.get("page_id", "") indexed_at = metadata.get("indexed_at", "") # Create a more descriptive title for Notion pages title = f"Notion: {page_title}" if indexed_at: title += f" (indexed: {indexed_at})" # Create a more descriptive description for Notion pages description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." # For URL, we can use a placeholder or construct a URL to the Notion page if available url = "" if page_id: # Notion page URLs follow this format url = f"https://notion.so/{page_id.replace('-', '')}" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 5, "name": "Notion", "type": "NOTION_CONNECTOR", "sources": sources_list, } return result_object, notion_chunks async def search_extension( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for extension data and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: extension_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="EXTENSION", ) elif search_mode == SearchMode.DOCUMENTS: extension_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="EXTENSION", ) # Transform document retriever results to match expected format extension_chunks = self._transform_document_results(extension_chunks) # Early return if no results if not extension_chunks: return { "id": 6, "name": "Extension", "type": "EXTENSION", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for i, chunk in enumerate(extension_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Extract extension-specific metadata webpage_title = metadata.get("VisitedWebPageTitle", "Untitled Page") webpage_url = metadata.get("VisitedWebPageURL", "") visit_date = metadata.get("VisitedWebPageDateWithTimeInISOString", "") visit_duration = metadata.get( "VisitedWebPageVisitDurationInMilliseconds", "" ) browsing_session_id = metadata.get("BrowsingSessionId", "") # Create a more descriptive title for extension data title = webpage_title if visit_date: # Format the date for display (simplified) try: # Just extract the date part for display formatted_date = ( visit_date.split("T")[0] if "T" in visit_date else visit_date ) title += f" (visited: {formatted_date})" except: # Fallback if date parsing fails title += f" (visited: {visit_date})" # Create a more descriptive description for extension data description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." # Add visit duration if available if visit_duration: try: duration_seconds = int(visit_duration) / 1000 if duration_seconds < 60: duration_text = f"{duration_seconds:.1f} seconds" else: duration_text = f"{duration_seconds / 60:.1f} minutes" if description: description += f" | Duration: {duration_text}" except: # Fallback if duration parsing fails pass source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": webpage_url, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 6, "name": "Extension", "type": "EXTENSION", "sources": sources_list, } return result_object, extension_chunks async def search_youtube( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: youtube_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="YOUTUBE_VIDEO", ) elif search_mode == SearchMode.DOCUMENTS: youtube_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="YOUTUBE_VIDEO", ) # Transform document retriever results to match expected format youtube_chunks = self._transform_document_results(youtube_chunks) # Early return if no results if not youtube_chunks: return { "id": 7, "name": "YouTube Videos", "type": "YOUTUBE_VIDEO", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(youtube_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Extract YouTube-specific metadata video_title = metadata.get("video_title", "Untitled Video") video_id = metadata.get("video_id", "") channel_name = metadata.get("channel_name", "") # published_date = metadata.get('published_date', '') # Create a more descriptive title for YouTube videos title = video_title if channel_name: title += f" - {channel_name}" # Create a more descriptive description for YouTube videos description = metadata.get( "description", chunk.get("content", "")[:100] ) if len(description) == 100: description += "..." # For URL, construct a URL to the YouTube video url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "video_id": video_id, # Additional field for YouTube videos "channel_name": channel_name, # Additional field for YouTube videos } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 7, # Assign a unique ID for the YouTube connector "name": "YouTube Videos", "type": "YOUTUBE_VIDEO", "sources": sources_list, } return result_object, youtube_chunks async def search_github( self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: github_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="GITHUB_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: github_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="GITHUB_CONNECTOR", ) # Transform document retriever results to match expected format github_chunks = self._transform_document_results(github_chunks) # Early return if no results if not github_chunks: return { "id": 8, "name": "GitHub", "type": "GITHUB_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(github_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a source entry source = { "id": document.get("id", self.source_id_counter), "title": document.get( "title", "GitHub Document" ), # Use specific title if available "description": metadata.get( "description", chunk.get("content", "")[:100] ), # Use description or content preview "url": metadata.get("url", ""), # Use URL if available in metadata } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 8, "name": "GitHub", "type": "GITHUB_CONNECTOR", "sources": sources_list, } return result_object, github_chunks async def search_linear( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for Linear issues and comments and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: linear_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="LINEAR_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: linear_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="LINEAR_CONNECTOR", ) # Transform document retriever results to match expected format linear_chunks = self._transform_document_results(linear_chunks) # Early return if no results if not linear_chunks: return { "id": 9, "name": "Linear Issues", "type": "LINEAR_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(linear_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Extract Linear-specific metadata issue_identifier = metadata.get("issue_identifier", "") issue_title = metadata.get("issue_title", "Untitled Issue") issue_state = metadata.get("state", "") comment_count = metadata.get("comment_count", 0) # Create a more descriptive title for Linear issues title = f"Linear: {issue_identifier} - {issue_title}" if issue_state: title += f" ({issue_state})" # Create a more descriptive description for Linear issues description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." # Add comment count info to description if comment_count: if description: description += f" | Comments: {comment_count}" else: description = f"Comments: {comment_count}" # For URL, we could construct a URL to the Linear issue if we have the workspace info # For now, use a generic placeholder url = "" if issue_identifier: # This is a generic format, may need to be adjusted based on actual Linear workspace url = f"https://linear.app/issue/{issue_identifier}" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "issue_identifier": issue_identifier, "state": issue_state, "comment_count": comment_count, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 9, # Assign a unique ID for the Linear connector "name": "Linear Issues", "type": "LINEAR_CONNECTOR", "sources": sources_list, } return result_object, linear_chunks async def search_jira( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for Jira issues and comments and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: jira_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="JIRA_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: jira_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="JIRA_CONNECTOR", ) # Transform document retriever results to match expected format jira_chunks = self._transform_document_results(jira_chunks) # Early return if no results if not jira_chunks: return { "id": 10, "name": "Jira Issues", "type": "JIRA_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for _i, chunk in enumerate(jira_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Extract Jira-specific metadata issue_key = metadata.get("issue_key", "") issue_title = metadata.get("issue_title", "Untitled Issue") status = metadata.get("status", "") priority = metadata.get("priority", "") issue_type = metadata.get("issue_type", "") comment_count = metadata.get("comment_count", 0) # Create a more descriptive title for Jira issues title = f"Jira: {issue_key} - {issue_title}" if status: title += f" ({status})" # Create a more descriptive description for Jira issues description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." # Add priority and type info to description info_parts = [] if priority: info_parts.append(f"Priority: {priority}") if issue_type: info_parts.append(f"Type: {issue_type}") if comment_count: info_parts.append(f"Comments: {comment_count}") if info_parts: if description: description += f" | {' | '.join(info_parts)}" else: description = " | ".join(info_parts) # For URL, we could construct a URL to the Jira issue if we have the base URL # For now, use a generic placeholder url = "" if issue_key and metadata.get("base_url"): url = f"{metadata.get('base_url')}/browse/{issue_key}" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "issue_key": issue_key, "status": status, "priority": priority, "issue_type": issue_type, "comment_count": comment_count, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 10, # Assign a unique ID for the Jira connector "name": "Jira Issues", "type": "JIRA_CONNECTOR", "sources": sources_list, } return result_object, jira_chunks async def search_linkup( self, user_query: str, user_id: str, mode: str = "standard" ) -> tuple: """ Search using Linkup API and return both the source information and documents Args: user_query: The user's query user_id: The user's ID mode: Search depth mode, can be "standard" or "deep" Returns: tuple: (sources_info, documents) """ # Get Linkup connector configuration linkup_connector = await self.get_connector_by_type( user_id, SearchSourceConnectorType.LINKUP_API ) if not linkup_connector: # Return empty results if no Linkup connector is configured return { "id": 10, "name": "Linkup Search", "type": "LINKUP_API", "sources": [], }, [] # Initialize Linkup client with API key from connector config linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") linkup_client = LinkupClient(api_key=linkup_api_key) # Perform search with Linkup try: response = linkup_client.search( query=user_query, depth=mode, # Use the provided mode ("standard" or "deep") output_type="searchResults", # Default to search results ) # Extract results from Linkup response - access as attribute instead of using .get() linkup_results = response.results if hasattr(response, "results") else [] # Only proceed if we have results if not linkup_results: return { "id": 10, "name": "Linkup Search", "type": "LINKUP_API", "sources": [], }, [] # Process each result and create sources directly without deduplication sources_list = [] documents = [] async with self.counter_lock: for i, result in enumerate(linkup_results): # Only process results that have content if not hasattr(result, "content") or not result.content: continue # Create a source entry source = { "id": self.source_id_counter, "title": ( result.name if hasattr(result, "name") else "Linkup Result" ), "description": ( result.content[:100] if hasattr(result, "content") else "" ), "url": result.url if hasattr(result, "url") else "", } sources_list.append(source) # Create a document entry document = { "chunk_id": f"linkup_chunk_{i}", "content": result.content if hasattr(result, "content") else "", "score": 1.0, # Default score since not provided by Linkup "document": { "id": self.source_id_counter, "title": ( result.name if hasattr(result, "name") else "Linkup Result" ), "document_type": "LINKUP_API", "metadata": { "url": result.url if hasattr(result, "url") else "", "type": result.type if hasattr(result, "type") else "", "source": "LINKUP_API", }, }, } documents.append(document) self.source_id_counter += 1 # Create result object result_object = { "id": 10, "name": "Linkup Search", "type": "LINKUP_API", "sources": sources_list, } return result_object, documents except Exception as e: # Log the error and return empty results print(f"Error searching with Linkup: {str(e)}") return { "id": 10, "name": "Linkup Search", "type": "LINKUP_API", "sources": [], }, [] async def search_discord( self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, ) -> tuple: """ Search for Discord messages and return both the source information and langchain documents Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return Returns: tuple: (sources_info, langchain_documents) """ if search_mode == SearchMode.CHUNKS: discord_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="DISCORD_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: discord_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, user_id=user_id, search_space_id=search_space_id, document_type="DISCORD_CONNECTOR", ) # Transform document retriever results to match expected format discord_chunks = self._transform_document_results(discord_chunks) # Early return if no results if not discord_chunks: return { "id": 11, "name": "Discord", "type": "DISCORD_CONNECTOR", "sources": [], }, [] # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: for i, chunk in enumerate(discord_chunks): # Extract document metadata document = chunk.get("document", {}) metadata = document.get("metadata", {}) # Create a mapped source entry with Discord-specific metadata channel_name = metadata.get("channel_name", "Unknown Channel") channel_id = metadata.get("channel_id", "") message_date = metadata.get("start_date", "") # Create a more descriptive title for Discord messages title = f"Discord: {channel_name}" if message_date: title += f" ({message_date})" # Create a more descriptive description for Discord messages description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." url = "" guild_id = metadata.get("guild_id", "") if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: # Fallback for DM channels or when guild_id is not available url = f"https://discord.com/channels/@me/{channel_id}" source = { "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, } self.source_id_counter += 1 sources_list.append(source) # Create result object result_object = { "id": 11, "name": "Discord", "type": "DISCORD_CONNECTOR", "sources": sources_list, } return result_object, discord_chunks