add search google gmail connector service

2025-09-01 10:09:08 +00:00 · 2025-08-04 00:58:37 +02:00 · 2025-08-04 00:58:37 +02:00 · e17d969087
commit e17d969087
parent 1de0f0309c
1 changed files with 126 additions and 0 deletions
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@ -1208,6 +1208,132 @@ class ConnectorService:

        return result_object, calendar_chunks

+    async def search_google_gmail(
+        self,
+        user_query: str,
+        user_id: str,
+        search_space_id: int,
+        top_k: int = 20,
+        search_mode: SearchMode = SearchMode.CHUNKS,
+    ) -> tuple:
+        """
+        Search Gmail messages and return both the source information and the matched chunks
+
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: Search mode (CHUNKS or DOCUMENTS)
+
+        Returns:
+            tuple: (result_object, gmail_chunks). result_object is a dict with
+            "id", "name", "type" and "sources" keys; gmail_chunks is the list of
+            retrieved chunks (an empty list when nothing matched).
+
+        Raises:
+            ValueError: If search_mode is neither CHUNKS nor DOCUMENTS
+        """
+        # Function-scope import and compiled pattern, hoisted out of the per-chunk loop
+        import re
+
+        bracketed_address = re.compile(r"<([^>]+)>")
+        preview_length = 150
+
+        if search_mode == SearchMode.CHUNKS:
+            gmail_chunks = await self.chunk_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="GOOGLE_GMAIL_CONNECTOR",
+            )
+        elif search_mode == SearchMode.DOCUMENTS:
+            gmail_chunks = await self.document_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="GOOGLE_GMAIL_CONNECTOR",
+            )
+            # Transform document retriever results to match expected format
+            gmail_chunks = self._transform_document_results(gmail_chunks)
+        else:
+            # Fail with a clear message instead of an UnboundLocalError below
+            raise ValueError(f"Unsupported search mode: {search_mode!r}")
+
+        # Early return if no results
+        if not gmail_chunks:
+            return {
+                "id": 32,
+                "name": "Gmail Messages",
+                "type": "GOOGLE_GMAIL_CONNECTOR",
+                "sources": [],
+            }, []
+
+        # Process each chunk and create sources directly without deduplication
+        sources_list = []
+        async with self.counter_lock:
+            for chunk in gmail_chunks:
+                # Extract document metadata
+                document = chunk.get("document", {})
+                metadata = document.get("metadata", {})
+
+                # Extract Gmail-specific metadata
+                message_id = metadata.get("message_id", "")
+                subject = metadata.get("subject", "No Subject")
+                sender = metadata.get("sender", "Unknown Sender")
+                date_str = metadata.get("date", "")
+                thread_id = metadata.get("thread_id", "")
+
+                # Title shows the bare address when the sender looks like "Name <address>"
+                title = f"Email: {subject}"
+                if sender:
+                    sender_match = bracketed_address.search(sender)
+                    shown_sender = sender_match.group(1) if sender_match else sender
+                    title += f" (from {shown_sender})"
+
+                # Preview of the content; the ellipsis is added only when text was cut off
+                content = chunk.get("content") or ""
+                description = content[:preview_length]
+                if len(content) > preview_length:
+                    description += "..."
+
+                # Add message info to description
+                info_parts = []
+                if date_str:
+                    info_parts.append(f"Date: {date_str}")
+                if thread_id:
+                    info_parts.append(f"Thread: {thread_id}")
+
+                if info_parts:
+                    if description:
+                        description += f" | {' | '.join(info_parts)}"
+                    else:
+                        description = " | ".join(info_parts)
+
+                # Link to the message in the Gmail web UI when its ID is known
+                url = ""
+                if message_id:
+                    url = f"https://mail.google.com/mail/u/0/#inbox/{message_id}"
+
+                source = {
+                    # Falls back to the shared counter when the document has no id
+                    "id": document.get("id", self.source_id_counter),
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                    "message_id": message_id,
+                    "subject": subject,
+                    "sender": sender,
+                    "date": date_str,
+                    "thread_id": thread_id,
+                }
+
+                self.source_id_counter += 1
+                sources_list.append(source)
+
+        # Create result object
+        result_object = {
+            "id": 32,  # Assign a unique ID for the Gmail connector
+            "name": "Gmail Messages",
+            "type": "GOOGLE_GMAIL_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, gmail_chunks
+
    async def search_confluence(
        self,
        user_query: str,