update connector indexing / update connector service

CREDO23 2025-07-24 11:52:06 +02:00
parent a6fe7e583b
commit ca98693005
2 changed files with 370 additions and 0 deletions


@@ -857,6 +857,120 @@ class ConnectorService:
        return result_object, linear_chunks

    async def search_jira(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple:
        """
        Search for Jira issues and comments and return both the source information and langchain documents

        Args:
            user_query: The user's query
            user_id: The user's ID
            search_space_id: The search space ID to search in
            top_k: Maximum number of results to return
            search_mode: Search mode (CHUNKS or DOCUMENTS)

        Returns:
            tuple: (sources_info, langchain_documents)
        """
        if search_mode == SearchMode.CHUNKS:
            jira_chunks = await self.chunk_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="JIRA_CONNECTOR"
            )
        elif search_mode == SearchMode.DOCUMENTS:
            jira_chunks = await self.document_retriever.hybrid_search(
                query_text=user_query,
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="JIRA_CONNECTOR"
            )
            # Transform document retriever results to match expected format
            jira_chunks = self._transform_document_results(jira_chunks)

        # Early return if no results
        if not jira_chunks:
            return {
                "id": 10,
                "name": "Jira Issues",
                "type": "JIRA_CONNECTOR",
                "sources": [],
            }, []

        # Process each chunk and create sources directly without deduplication
        sources_list = []
        async with self.counter_lock:
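            # The counter lock keeps fallback source IDs unique when searches run concurrently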
            for _i, chunk in enumerate(jira_chunks):
                # Extract document metadata
                document = chunk.get('document', {})
                metadata = document.get('metadata', {})

                # Extract Jira-specific metadata
                issue_key = metadata.get('issue_key', '')
                issue_title = metadata.get('issue_title', 'Untitled Issue')
                status = metadata.get('status', '')
                priority = metadata.get('priority', '')
                issue_type = metadata.get('issue_type', '')
                comment_count = metadata.get('comment_count', 0)

                # Create a more descriptive title for Jira issues
                title = f"Jira: {issue_key} - {issue_title}"
                if status:
                    title += f" ({status})"

                # Create a more descriptive description for Jira issues
                description = chunk.get('content', '')[:100]
                if len(description) == 100:
                    description += "..."

                # Add priority and type info to description
                info_parts = []
                if priority:
                    info_parts.append(f"Priority: {priority}")
                if issue_type:
                    info_parts.append(f"Type: {issue_type}")
                if comment_count:
                    info_parts.append(f"Comments: {comment_count}")

                if info_parts:
                    if description:
                        description += f" | {' | '.join(info_parts)}"
                    else:
                        description = ' | '.join(info_parts)
                # Construct a URL to the Jira issue when the base URL is available;
                # otherwise fall back to an empty placeholder
                url = ""
                if issue_key and metadata.get('base_url'):
                    url = f"{metadata.get('base_url')}/browse/{issue_key}"
                source = {
                    "id": document.get('id', self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                    "issue_key": issue_key,
                    "status": status,
                    "priority": priority,
                    "issue_type": issue_type,
                    "comment_count": comment_count
                }
                self.source_id_counter += 1
                sources_list.append(source)

        # Create result object
        result_object = {
            "id": 10,  # Assign a unique ID for the Jira connector
            "name": "Jira Issues",
            "type": "JIRA_CONNECTOR",
            "sources": sources_list,
        }

        return result_object, jira_chunks

    async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple:
        """
        Search using Linkup API and return both the source information and documents


@@ -13,6 +13,7 @@ from app.connectors.notion_history import NotionHistoryConnector
from app.connectors.github_connector import GitHubConnector
from app.connectors.linear_connector import LinearConnector
from app.connectors.discord_connector import DiscordConnector
from app.connectors.jira_connector import JiraConnector
from slack_sdk.errors import SlackApiError
import logging
import asyncio
@@ -1651,3 +1652,258 @@ async def index_discord_messages(
        )
        logger.error(f"Failed to index Discord messages: {str(e)}", exc_info=True)
        return 0, f"Failed to index Discord messages: {str(e)}"


async def index_jira_issues(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str = None,
    end_date: str = None,
    update_last_indexed: bool = True
) -> Tuple[int, Optional[str]]:
    """
    Index Jira issues and comments.

    Args:
        session: Database session
        connector_id: ID of the Jira connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="jira_issues_indexing",
        source="connector_indexing_task",
        message=f"Starting Jira issues indexing for connector {connector_id}",
        metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date}
    )
    try:
        # Get the connector from the database
        result = await session.execute(
            select(SearchSourceConnector).where(SearchSourceConnector.id == connector_id)
        )
        connector = result.scalar_one_or_none()

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"}
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the Jira credentials from the connector config
        jira_token = connector.config.get("JIRA_PERSONAL_ACCESS_TOKEN")
        jira_base_url = connector.config.get("JIRA_BASE_URL")
        if not jira_token or not jira_base_url:
            await task_logger.log_task_failure(
                log_entry,
                f"Jira credentials not found in connector config for connector {connector_id}",
                "Missing Jira credentials",
                {"error_type": "MissingCredentials"}
            )
            return 0, "Jira credentials not found in connector config"

        # Initialize Jira client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Jira client for connector {connector_id}",
            {"stage": "client_initialization"}
        )
        jira_client = JiraConnector(base_url=jira_base_url, personal_access_token=jira_token)

        # Calculate date range
        if start_date is None or end_date is None:
            # Fall back to calculating dates based on last_indexed_at
            calculated_end_date = datetime.now()
            if connector.last_indexed_at:
                calculated_start_date = connector.last_indexed_at
            else:
                # If never indexed, go back 30 days
                calculated_start_date = calculated_end_date - timedelta(days=30)
            start_date_str = calculated_start_date.strftime('%Y-%m-%d')
            end_date_str = calculated_end_date.strftime('%Y-%m-%d')
        else:
            start_date_str = start_date
            end_date_str = end_date
        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Jira issues from {start_date_str} to {end_date_str}",
            {"stage": "fetching_issues", "start_date": start_date_str, "end_date": end_date_str}
        )

        # Get issues within date range
        try:
            issues, error = jira_client.get_issues_by_date_range(
                start_date=start_date_str,
                end_date=end_date_str,
                include_comments=True
            )

            if error:
                logger.error(f"Failed to get Jira issues: {error}")

                # Don't treat "No issues found" as an error that should stop indexing
                if "No issues found" in error:
                    logger.info("No issues found is not a critical error, continuing with update")
                    if update_last_indexed:
                        connector.last_indexed_at = datetime.now()
                        await session.commit()
                        logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found")

                    await task_logger.log_task_completion(
                        log_entry,
                        f"No Jira issues found in date range {start_date_str} to {end_date_str}",
                        {"indexed_count": 0}
                    )
                    return 0, None
                else:
                    await task_logger.log_task_failure(
                        log_entry,
                        f"Failed to get Jira issues: {error}",
                        "API Error",
                        {"error_type": "APIError"}
                    )
                    return 0, f"Failed to get Jira issues: {error}"

            logger.info(f"Retrieved {len(issues)} issues from Jira API")
            await task_logger.log_task_progress(
                log_entry,
                f"Retrieved {len(issues)} issues from Jira API",
                {"stage": "processing_issues", "issue_count": len(issues)}
            )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Error fetching Jira issues: {str(e)}",
                "Fetch Error",
                {"error_type": type(e).__name__}
            )
            logger.error(f"Error fetching Jira issues: {str(e)}", exc_info=True)
            return 0, f"Error fetching Jira issues: {str(e)}"
        # Process and index each issue
        indexed_count = 0
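        # Each issue is formatted, converted to markdown, embedded, and stored as a Document with chunk rows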
        for issue in issues:
            try:
                # Format the issue for better readability
                formatted_issue = jira_client.format_issue(issue)

                # Convert to markdown
                issue_markdown = jira_client.format_issue_to_markdown(formatted_issue)

                # Create document metadata
                metadata = {
                    "issue_key": formatted_issue.get("key", ""),
                    "issue_title": formatted_issue.get("title", ""),
                    "status": formatted_issue.get("status", ""),
                    "priority": formatted_issue.get("priority", ""),
                    "issue_type": formatted_issue.get("issue_type", ""),
                    "project": formatted_issue.get("project", ""),
                    "assignee": formatted_issue.get("assignee", {}).get("display_name", "") if formatted_issue.get("assignee") else "",
"reporter": formatted_issue.get("reporter", {}).get("display_name", ""),
"created_at": formatted_issue.get("created_at", ""),
"updated_at": formatted_issue.get("updated_at", ""),
"comment_count": len(formatted_issue.get("comments", [])),
"connector_id": connector_id,
"source": "jira",
"base_url": jira_base_url
}
# Generate content hash
content_hash = generate_content_hash(issue_markdown)
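                # The content hash acts as a fingerprint so unchanged issues are not re-indexed on later runs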
                # Check if document already exists
                existing_doc_result = await session.execute(
                    select(Document).where(Document.content_hash == content_hash)
                )
                existing_doc = existing_doc_result.scalar_one_or_none()

                if existing_doc:
                    logger.debug(f"Document with hash {content_hash} already exists, skipping")
                    continue

                # Create new document
                document = Document(
                    title=f"Jira: {formatted_issue.get('key', 'Unknown')} - {formatted_issue.get('title', 'Untitled')}",
                    document_type=DocumentType.JIRA_CONNECTOR,
                    document_metadata=metadata,
                    content=issue_markdown,
                    content_hash=content_hash,
                    search_space_id=search_space_id
                )

                # Generate embedding
                embedding = await config.embedding_model_instance.get_embedding(issue_markdown)
                document.embedding = embedding

                session.add(document)
                await session.flush()  # Flush to get the document ID

                # Create chunks for the document
                chunks = await config.chunking_model_instance.chunk_document(issue_markdown)
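                # Store one embedded Chunk row per chunk so retrieval can match at chunk granularity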
                for chunk_content in chunks:
                    chunk_embedding = await config.embedding_model_instance.get_embedding(chunk_content)
                    chunk = Chunk(
                        content=chunk_content,
                        embedding=chunk_embedding,
                        document_id=document.id
                    )
                    session.add(chunk)

                indexed_count += 1
                logger.debug(f"Indexed Jira issue: {formatted_issue.get('key', 'Unknown')}")

            except Exception as e:
                logger.error(f"Error processing Jira issue {issue.get('key', 'Unknown')}: {str(e)}", exc_info=True)
                continue
        # Commit all changes
        await session.commit()

        # Update last_indexed_at timestamp
        if update_last_indexed:
            connector.last_indexed_at = datetime.now()
            await session.commit()
            logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")

        await task_logger.log_task_completion(
            log_entry,
            f"Successfully indexed {indexed_count} Jira issues",
            {"indexed_count": indexed_count}
        )
        logger.info(f"Successfully indexed {indexed_count} Jira issues")
        return indexed_count, None

    except Exception as e:
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index Jira issues: {str(e)}",
            str(e),
            {"error_type": type(e).__name__}
        )
        logger.error(f"Failed to index Jira issues: {str(e)}", exc_info=True)
        return 0, f"Failed to index Jira issues: {str(e)}"