From 5aa52375c3a61c8c66910258016a3ca9414645b0 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Tue, 12 Aug 2025 15:28:13 -0700
Subject: [PATCH] refactor: refactored background_tasks & indexing_tasks

---
 .../app/connectors/test_github_connector.py | 188 -
 .../app/connectors/test_slack_history.py | 507 ---
 .../app/routes/documents_routes.py | 2 +-
 .../routes/search_source_connectors_routes.py | 2 +-
 .../app/tasks/background_tasks.py | 1077 ------
 .../app/tasks/connector_indexers/__init__.py | 54 +
 .../app/tasks/connector_indexers/base.py | 183 +
 .../connector_indexers/clickup_indexer.py | 301 ++
 .../connector_indexers/confluence_indexer.py | 338 ++
 .../connector_indexers/discord_indexer.py | 441 +++
 .../connector_indexers/github_indexer.py | 326 ++
 .../google_calendar_indexer.py | 350 ++
 .../tasks/connector_indexers/jira_indexer.py | 320 ++
 .../connector_indexers/linear_indexer.py | 337 ++
 .../connector_indexers/notion_indexer.py | 406 ++
 .../tasks/connector_indexers/slack_indexer.py | 396 ++
 .../app/tasks/connectors_indexing_tasks.py | 3375 -----------------
 .../app/tasks/document_processors/__init__.py | 47 +
 .../app/tasks/document_processors/base.py | 75 +
 .../extension_processor.py | 163 +
 .../document_processors/file_processors.py | 261 ++
 .../document_processors/markdown_processor.py | 136 +
 .../tasks/document_processors/url_crawler.py | 242 ++
 .../document_processors/youtube_processor.py | 326 ++
 24 files changed, 4704 insertions(+), 5149 deletions(-)
 delete mode 100644 surfsense_backend/app/connectors/test_github_connector.py
 delete mode 100644 surfsense_backend/app/connectors/test_slack_history.py
 delete mode 100644 surfsense_backend/app/tasks/background_tasks.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/__init__.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/base.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/github_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
 create mode 100644 surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
 delete mode 100644 surfsense_backend/app/tasks/connectors_indexing_tasks.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/__init__.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/base.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/extension_processor.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/file_processors.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/markdown_processor.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/url_crawler.py
 create mode 100644 surfsense_backend/app/tasks/document_processors/youtube_processor.py

diff --git a/surfsense_backend/app/connectors/test_github_connector.py b/surfsense_backend/app/connectors/test_github_connector.py
deleted file mode 100644
index 6ed9ffa..0000000
--- a/surfsense_backend/app/connectors/test_github_connector.py
+++ /dev/null
@@ -1,188 +0,0 @@ -import unittest -from datetime import datetime -from unittest.mock import Mock, patch - -from github3.exceptions import ForbiddenError # Import the specific exception - -# Adjust the import path based on the actual location if test_github_connector.py -# is not in the same directory as github_connector.py or if paths are set up differently. -# Assuming surfsend_backend/app/connectors/test_github_connector.py -from surfsense_backend.app.connectors.github_connector import GitHubConnector - - -class TestGitHubConnector(unittest.TestCase): - @patch("surfsense_backend.app.connectors.github_connector.github_login") - def test_get_user_repositories_uses_type_all(self, mock_github_login): - # Mock the GitHub client object and its methods - mock_gh_instance = Mock() - mock_github_login.return_value = mock_gh_instance - - # Mock the self.gh.me() call in __init__ to prevent an actual API call - mock_gh_instance.me.return_value = Mock() # Simple mock to pass initialization - - # Prepare mock repository data - mock_repo1_data = Mock() - mock_repo1_data.id = 1 - mock_repo1_data.name = "repo1" - mock_repo1_data.full_name = "user/repo1" - mock_repo1_data.private = False - mock_repo1_data.html_url = "http://example.com/user/repo1" - mock_repo1_data.description = "Test repo 1" - mock_repo1_data.updated_at = datetime( - 2023, 1, 1, 10, 30, 0 - ) # Added time component - - mock_repo2_data = Mock() - mock_repo2_data.id = 2 - mock_repo2_data.name = "org-repo" - mock_repo2_data.full_name = "org/org-repo" - mock_repo2_data.private = True - mock_repo2_data.html_url = "http://example.com/org/org-repo" - mock_repo2_data.description = "Org repo" - mock_repo2_data.updated_at = datetime( - 2023, 1, 2, 12, 0, 0 - ) # Added time component - - # Configure the mock for gh.repositories() call - # This method is an iterator, so it should return an iterable (e.g., a list) - mock_gh_instance.repositories.return_value = [mock_repo1_data, mock_repo2_data] - - connector = GitHubConnector(token="fake_token") - repositories = connector.get_user_repositories() - - # Assert that gh.repositories was called correctly - mock_gh_instance.repositories.assert_called_once_with( - type="all", sort="updated" - ) - - # Assert the structure and content of the returned data - expected_repositories = [ - { - "id": 1, - "name": "repo1", - "full_name": "user/repo1", - "private": False, - "url": "http://example.com/user/repo1", - "description": "Test repo 1", - "last_updated": datetime(2023, 1, 1, 10, 30, 0), - }, - { - "id": 2, - "name": "org-repo", - "full_name": "org/org-repo", - "private": True, - "url": "http://example.com/org/org-repo", - "description": "Org repo", - "last_updated": datetime(2023, 1, 2, 12, 0, 0), - }, - ] - self.assertEqual(repositories, expected_repositories) - self.assertEqual(len(repositories), 2) - - @patch("surfsense_backend.app.connectors.github_connector.github_login") - def test_get_user_repositories_handles_empty_description_and_none_updated_at( - self, mock_github_login - ): - # Mock the GitHub client object and its methods - mock_gh_instance = Mock() - mock_github_login.return_value = mock_gh_instance - mock_gh_instance.me.return_value = Mock() - - mock_repo_data = Mock() - mock_repo_data.id = 1 - mock_repo_data.name = "repo_no_desc" - mock_repo_data.full_name = "user/repo_no_desc" - mock_repo_data.private = False - mock_repo_data.html_url = "http://example.com/user/repo_no_desc" - mock_repo_data.description = None # Test None description - mock_repo_data.updated_at = None # Test None updated_at 
- - mock_gh_instance.repositories.return_value = [mock_repo_data] - connector = GitHubConnector(token="fake_token") - repositories = connector.get_user_repositories() - - mock_gh_instance.repositories.assert_called_once_with( - type="all", sort="updated" - ) - expected_repositories = [ - { - "id": 1, - "name": "repo_no_desc", - "full_name": "user/repo_no_desc", - "private": False, - "url": "http://example.com/user/repo_no_desc", - "description": "", # Expect empty string - "last_updated": None, # Expect None - } - ] - self.assertEqual(repositories, expected_repositories) - - @patch("surfsense_backend.app.connectors.github_connector.github_login") - def test_github_connector_initialization_failure_forbidden(self, mock_github_login): - # Test that __init__ raises ValueError on auth failure (ForbiddenError) - mock_gh_instance = Mock() - mock_github_login.return_value = mock_gh_instance - - # Create a mock response object for the ForbiddenError - # The actual response structure might vary, but github3.py's ForbiddenError - # can be instantiated with just a response object that has a status_code. - mock_response = Mock() - mock_response.status_code = 403 # Typically Forbidden - - # Setup the side_effect for self.gh.me() - mock_gh_instance.me.side_effect = ForbiddenError(mock_response) - - with self.assertRaises(ValueError) as context: - GitHubConnector(token="invalid_token_forbidden") - self.assertIn( - "Invalid GitHub token or insufficient permissions.", str(context.exception) - ) - - @patch("surfsense_backend.app.connectors.github_connector.github_login") - def test_github_connector_initialization_failure_authentication_failed( - self, mock_github_login - ): - # Test that __init__ raises ValueError on auth failure (AuthenticationFailed, which is a subclass of ForbiddenError) - # For github3.py, AuthenticationFailed is more specific for token issues. 
- from github3.exceptions import AuthenticationFailed - - mock_gh_instance = Mock() - mock_github_login.return_value = mock_gh_instance - - mock_response = Mock() - mock_response.status_code = 401 # Typically Unauthorized - - mock_gh_instance.me.side_effect = AuthenticationFailed(mock_response) - - with self.assertRaises(ValueError) as context: - GitHubConnector(token="invalid_token_authfailed") - self.assertIn( - "Invalid GitHub token or insufficient permissions.", str(context.exception) - ) - - @patch("surfsense_backend.app.connectors.github_connector.github_login") - def test_get_user_repositories_handles_api_exception(self, mock_github_login): - mock_gh_instance = Mock() - mock_github_login.return_value = mock_gh_instance - mock_gh_instance.me.return_value = Mock() - - # Simulate an exception when calling repositories - mock_gh_instance.repositories.side_effect = Exception("API Error") - - connector = GitHubConnector(token="fake_token") - # We expect it to log an error and return an empty list - with patch( - "surfsense_backend.app.connectors.github_connector.logger" - ) as mock_logger: - repositories = connector.get_user_repositories() - - self.assertEqual(repositories, []) - mock_logger.error.assert_called_once() - self.assertIn( - "Failed to fetch GitHub repositories: API Error", - mock_logger.error.call_args[0][0], - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/surfsense_backend/app/connectors/test_slack_history.py b/surfsense_backend/app/connectors/test_slack_history.py deleted file mode 100644 index 67677df..0000000 --- a/surfsense_backend/app/connectors/test_slack_history.py +++ /dev/null @@ -1,507 +0,0 @@ -import unittest -from unittest.mock import Mock, call, patch - -from slack_sdk.errors import SlackApiError - -# Since test_slack_history.py is in the same directory as slack_history.py -from .slack_history import SlackHistory - - -class TestSlackHistoryGetAllChannels(unittest.TestCase): - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_pagination_with_delay( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - # Mock API responses now include is_private and is_member - page1_response = { - "channels": [ - {"name": "general", "id": "C1", "is_private": False, "is_member": True}, - {"name": "dev", "id": "C0", "is_private": False, "is_member": True}, - ], - "response_metadata": {"next_cursor": "cursor123"}, - } - page2_response = { - "channels": [ - {"name": "random", "id": "C2", "is_private": True, "is_member": True} - ], - "response_metadata": {"next_cursor": ""}, - } - - mock_client_instance.conversations_list.side_effect = [ - page1_response, - page2_response, - ] - - slack_history = SlackHistory(token="fake_token") - channels_list = slack_history.get_all_channels(include_private=True) - - expected_channels_list = [ - {"id": "C1", "name": "general", "is_private": False, "is_member": True}, - {"id": "C0", "name": "dev", "is_private": False, "is_member": True}, - {"id": "C2", "name": "random", "is_private": True, "is_member": True}, - ] - - self.assertEqual(len(channels_list), 3) - self.assertListEqual( - channels_list, expected_channels_list - ) # Assert list equality - - expected_calls = [ - call(types="public_channel,private_channel", cursor=None, limit=1000), - call( - types="public_channel,private_channel", cursor="cursor123", limit=1000 - ), - ] - 
mock_client_instance.conversations_list.assert_has_calls(expected_calls) - self.assertEqual(mock_client_instance.conversations_list.call_count, 2) - - mock_sleep.assert_called_once_with(3) - mock_logger.info.assert_called_once_with( - "Paginating for channels, waiting 3 seconds before next call. Cursor: cursor123" - ) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_rate_limit_with_retry_after( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 429 - mock_error_response.headers = {"Retry-After": "5"} - - successful_response = { - "channels": [ - {"name": "general", "id": "C1", "is_private": False, "is_member": True} - ], - "response_metadata": {"next_cursor": ""}, - } - - mock_client_instance.conversations_list.side_effect = [ - SlackApiError(message="ratelimited", response=mock_error_response), - successful_response, - ] - - slack_history = SlackHistory(token="fake_token") - channels_list = slack_history.get_all_channels(include_private=True) - - expected_channels_list = [ - {"id": "C1", "name": "general", "is_private": False, "is_member": True} - ] - self.assertEqual(len(channels_list), 1) - self.assertListEqual(channels_list, expected_channels_list) - - mock_sleep.assert_called_once_with(5) - mock_logger.warning.assert_called_once_with( - "Slack API rate limit hit while fetching channels. Waiting for 5 seconds. Cursor: None" - ) - - expected_calls = [ - call(types="public_channel,private_channel", cursor=None, limit=1000), - call(types="public_channel,private_channel", cursor=None, limit=1000), - ] - mock_client_instance.conversations_list.assert_has_calls(expected_calls) - self.assertEqual(mock_client_instance.conversations_list.call_count, 2) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_rate_limit_no_retry_after_valid_header( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 429 - mock_error_response.headers = {"Retry-After": "invalid_value"} - - successful_response = { - "channels": [ - {"name": "general", "id": "C1", "is_private": False, "is_member": True} - ], - "response_metadata": {"next_cursor": ""}, - } - - mock_client_instance.conversations_list.side_effect = [ - SlackApiError(message="ratelimited", response=mock_error_response), - successful_response, - ] - - slack_history = SlackHistory(token="fake_token") - channels_list = slack_history.get_all_channels(include_private=True) - - expected_channels_list = [ - {"id": "C1", "name": "general", "is_private": False, "is_member": True} - ] - self.assertListEqual(channels_list, expected_channels_list) - mock_sleep.assert_called_once_with(60) # Default fallback - mock_logger.warning.assert_called_once_with( - "Slack API rate limit hit while fetching channels. Waiting for 60 seconds. 
Cursor: None" - ) - self.assertEqual(mock_client_instance.conversations_list.call_count, 2) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_rate_limit_no_retry_after_header( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 429 - mock_error_response.headers = {} - - successful_response = { - "channels": [ - {"name": "general", "id": "C1", "is_private": False, "is_member": True} - ], - "response_metadata": {"next_cursor": ""}, - } - - mock_client_instance.conversations_list.side_effect = [ - SlackApiError(message="ratelimited", response=mock_error_response), - successful_response, - ] - - slack_history = SlackHistory(token="fake_token") - channels_list = slack_history.get_all_channels(include_private=True) - - expected_channels_list = [ - {"id": "C1", "name": "general", "is_private": False, "is_member": True} - ] - self.assertListEqual(channels_list, expected_channels_list) - mock_sleep.assert_called_once_with(60) # Default fallback - mock_logger.warning.assert_called_once_with( - "Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None" - ) - self.assertEqual(mock_client_instance.conversations_list.call_count, 2) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_other_slack_api_error( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 500 - mock_error_response.headers = {} - mock_error_response.data = {"ok": False, "error": "internal_error"} - - original_error = SlackApiError( - message="server error", response=mock_error_response - ) - mock_client_instance.conversations_list.side_effect = original_error - - slack_history = SlackHistory(token="fake_token") - - with self.assertRaises(SlackApiError) as context: - slack_history.get_all_channels(include_private=True) - - self.assertEqual(context.exception.response.status_code, 500) - self.assertIn("server error", str(context.exception)) - mock_sleep.assert_not_called() - mock_logger.warning.assert_not_called() # Ensure no rate limit log - mock_client_instance.conversations_list.assert_called_once_with( - types="public_channel,private_channel", cursor=None, limit=1000 - ) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_get_all_channels_handles_missing_name_id_gracefully( - self, mock_web_client, mock_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - response_with_malformed_data = { - "channels": [ - {"id": "C1_missing_name", "is_private": False, "is_member": True}, - {"name": "channel_missing_id", "is_private": False, "is_member": True}, - { - "name": "general", - "id": "C2_valid", - "is_private": False, - "is_member": True, - }, - ], - "response_metadata": {"next_cursor": ""}, - } - - mock_client_instance.conversations_list.return_value = ( - response_with_malformed_data - ) - - slack_history = SlackHistory(token="fake_token") - channels_list = slack_history.get_all_channels(include_private=True) - - 
expected_channels_list = [ - { - "id": "C2_valid", - "name": "general", - "is_private": False, - "is_member": True, - } - ] - self.assertEqual(len(channels_list), 1) - self.assertListEqual(channels_list, expected_channels_list) - - self.assertEqual(mock_logger.warning.call_count, 2) - mock_logger.warning.assert_any_call( - "Channel found with missing name or id. Data: {'id': 'C1_missing_name', 'is_private': False, 'is_member': True}" - ) - mock_logger.warning.assert_any_call( - "Channel found with missing name or id. Data: {'name': 'channel_missing_id', 'is_private': False, 'is_member': True}" - ) - - mock_sleep.assert_not_called() - mock_client_instance.conversations_list.assert_called_once_with( - types="public_channel,private_channel", cursor=None, limit=1000 - ) - - -if __name__ == "__main__": - unittest.main() - - -class TestSlackHistoryGetConversationHistory(unittest.TestCase): - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_proactive_delay_single_page( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - mock_client_instance.conversations_history.return_value = { - "messages": [{"text": "msg1"}], - "has_more": False, - } - - slack_history = SlackHistory(token="fake_token") - slack_history.get_conversation_history(channel_id="C123") - - mock_time_sleep.assert_called_once_with(1.2) # Proactive delay - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_proactive_delay_multiple_pages( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - mock_client_instance.conversations_history.side_effect = [ - { - "messages": [{"text": "msg1"}], - "has_more": True, - "response_metadata": {"next_cursor": "cursor1"}, - }, - {"messages": [{"text": "msg2"}], "has_more": False}, - ] - - slack_history = SlackHistory(token="fake_token") - slack_history.get_conversation_history(channel_id="C123") - - # Expected calls: 1.2 (page1), 1.2 (page2) - self.assertEqual(mock_time_sleep.call_count, 2) - mock_time_sleep.assert_has_calls([call(1.2), call(1.2)]) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 429 - mock_error_response.headers = {"Retry-After": "5"} - - mock_client_instance.conversations_history.side_effect = [ - SlackApiError(message="ratelimited", response=mock_error_response), - {"messages": [{"text": "msg1"}], "has_more": False}, - ] - - slack_history = SlackHistory(token="fake_token") - messages = slack_history.get_conversation_history(channel_id="C123") - - self.assertEqual(len(messages), 1) - self.assertEqual(messages[0]["text"], "msg1") - - # Expected sleep calls: 1.2 (proactive for 1st attempt), 5 (rate limit), 1.2 (proactive for 2nd attempt) - mock_time_sleep.assert_has_calls( - [call(1.2), call(5), call(1.2)], any_order=False - ) - mock_logger.warning.assert_called_once() # Check that a warning was logged for rate limiting - - @patch("surfsense_backend.app.connectors.slack_history.logger") - 
@patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_not_in_channel_error(self, mock_web_client, mock_time_sleep, mock_logger): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = ( - 403 # Typical for not_in_channel, but data matters more - ) - mock_error_response.data = {"ok": False, "error": "not_in_channel"} - - # This error is now raised by the inner try-except, then caught by the outer one - mock_client_instance.conversations_history.side_effect = SlackApiError( - message="not_in_channel error", response=mock_error_response - ) - - slack_history = SlackHistory(token="fake_token") - messages = slack_history.get_conversation_history(channel_id="C123") - - self.assertEqual(messages, []) - mock_logger.warning.assert_called_with( - "Bot is not in channel 'C123'. Cannot fetch history. Please add the bot to this channel." - ) - mock_time_sleep.assert_called_once_with( - 1.2 - ) # Proactive delay before the API call - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_other_slack_api_error_propagates( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 500 - mock_error_response.data = {"ok": False, "error": "internal_error"} - original_error = SlackApiError( - message="server error", response=mock_error_response - ) - - mock_client_instance.conversations_history.side_effect = original_error - - slack_history = SlackHistory(token="fake_token") - - with self.assertRaises(SlackApiError) as context: - slack_history.get_conversation_history(channel_id="C123") - - self.assertIn( - "Error retrieving history for channel C123", str(context.exception) - ) - self.assertIs(context.exception.response, mock_error_response) - mock_time_sleep.assert_called_once_with(1.2) # Proactive delay - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_general_exception_propagates( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - original_error = Exception("Something broke") - mock_client_instance.conversations_history.side_effect = original_error - - slack_history = SlackHistory(token="fake_token") - - with self.assertRaises(Exception) as context: # Check for generic Exception - slack_history.get_conversation_history(channel_id="C123") - - self.assertIs( - context.exception, original_error - ) # Should re-raise the original error - mock_logger.error.assert_called_once_with( - "Unexpected error in get_conversation_history for channel C123: Something broke" - ) - mock_time_sleep.assert_called_once_with(1.2) # Proactive delay - - -class TestSlackHistoryGetUserInfo(unittest.TestCase): - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 429 - mock_error_response.headers = {"Retry-After": "3"} # Using 3 seconds for test - - 
successful_user_data = {"id": "U123", "name": "testuser"} - - mock_client_instance.users_info.side_effect = [ - SlackApiError(message="ratelimited_userinfo", response=mock_error_response), - {"user": successful_user_data}, - ] - - slack_history = SlackHistory(token="fake_token") - user_info = slack_history.get_user_info(user_id="U123") - - self.assertEqual(user_info, successful_user_data) - - # Assert that time.sleep was called for the rate limit - mock_time_sleep.assert_called_once_with(3) - mock_logger.warning.assert_called_once_with( - "Rate limited by Slack on users.info for user U123. Retrying after 3 seconds." - ) - # Assert users_info was called twice (original + retry) - self.assertEqual(mock_client_instance.users_info.call_count, 2) - mock_client_instance.users_info.assert_has_calls( - [call(user="U123"), call(user="U123")] - ) - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch( - "surfsense_backend.app.connectors.slack_history.time.sleep" - ) # time.sleep might be called by other logic, but not expected here - @patch("slack_sdk.WebClient") - def test_other_slack_api_error_propagates( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - - mock_error_response = Mock() - mock_error_response.status_code = 500 # Some other error - mock_error_response.data = {"ok": False, "error": "internal_server_error"} - original_error = SlackApiError( - message="internal server error", response=mock_error_response - ) - - mock_client_instance.users_info.side_effect = original_error - - slack_history = SlackHistory(token="fake_token") - - with self.assertRaises(SlackApiError) as context: - slack_history.get_user_info(user_id="U123") - - # Check that the raised error is the one we expect - self.assertIn("Error retrieving user info for U123", str(context.exception)) - self.assertIs(context.exception.response, mock_error_response) - mock_time_sleep.assert_not_called() # No rate limit sleep - - @patch("surfsense_backend.app.connectors.slack_history.logger") - @patch("surfsense_backend.app.connectors.slack_history.time.sleep") - @patch("slack_sdk.WebClient") - def test_general_exception_propagates( - self, mock_web_client, mock_time_sleep, mock_logger - ): - mock_client_instance = mock_web_client.return_value - original_error = Exception("A very generic problem") - mock_client_instance.users_info.side_effect = original_error - - slack_history = SlackHistory(token="fake_token") - - with self.assertRaises(Exception) as context: - slack_history.get_user_info(user_id="U123") - - self.assertIs( - context.exception, original_error - ) # Check it's the exact same exception - mock_logger.error.assert_called_once_with( - "Unexpected error in get_user_info for user U123: A very generic problem" - ) - mock_time_sleep.assert_not_called() # No rate limit sleep diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 2c21fa3..8711859 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -10,7 +10,7 @@ from app.config import config as app_config from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate from app.services.task_logging_service import TaskLoggingService -from app.tasks.background_tasks import ( +from app.tasks.document_processors import ( add_crawled_url_document, add_extension_received_document, 
add_received_file_document_using_docling, diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index a65f1cf..49f3128 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -35,7 +35,7 @@ from app.schemas import ( SearchSourceConnectorRead, SearchSourceConnectorUpdate, ) -from app.tasks.connectors_indexing_tasks import ( +from app.tasks.connector_indexers import ( index_clickup_tasks, index_confluence_pages, index_discord_messages, diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py deleted file mode 100644 index 06304b4..0000000 --- a/surfsense_backend/app/tasks/background_tasks.py +++ /dev/null @@ -1,1077 +0,0 @@ -import logging -from urllib.parse import parse_qs, urlparse - -import aiohttp -import validators -from langchain_community.document_loaders import AsyncChromiumLoader, FireCrawlLoader -from langchain_community.document_transformers import MarkdownifyTransformer -from langchain_core.documents import Document as LangChainDocument -from sqlalchemy.exc import SQLAlchemyError -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.future import select -from youtube_transcript_api import YouTubeTranscriptApi - -from app.config import config -from app.db import Chunk, Document, DocumentType -from app.prompts import SUMMARY_PROMPT_TEMPLATE -from app.schemas import ExtensionDocumentContent -from app.services.llm_service import get_user_long_context_llm -from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - convert_document_to_markdown, - generate_content_hash, -) - -md = MarkdownifyTransformer() - - -async def add_crawled_url_document( - session: AsyncSession, url: str, search_space_id: int, user_id: str -) -> Document | None: - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="crawl_url_document", - source="background_task", - message=f"Starting URL crawling process for: {url}", - metadata={"url": url, "user_id": str(user_id)}, - ) - - try: - # URL validation step - await task_logger.log_task_progress( - log_entry, f"Validating URL: {url}", {"stage": "validation"} - ) - - if not validators.url(url): - raise ValueError(f"Url {url} is not a valid URL address") - - # Set up crawler - await task_logger.log_task_progress( - log_entry, - f"Setting up crawler for URL: {url}", - { - "stage": "crawler_setup", - "firecrawl_available": bool(config.FIRECRAWL_API_KEY), - }, - ) - - if config.FIRECRAWL_API_KEY: - crawl_loader = FireCrawlLoader( - url=url, - api_key=config.FIRECRAWL_API_KEY, - mode="scrape", - params={ - "formats": ["markdown"], - "excludeTags": ["a"], - }, - ) - else: - crawl_loader = AsyncChromiumLoader(urls=[url], headless=True) - - # Perform crawling - await task_logger.log_task_progress( - log_entry, - f"Crawling URL content: {url}", - {"stage": "crawling", "crawler_type": type(crawl_loader).__name__}, - ) - - url_crawled = await crawl_loader.aload() - - if isinstance(crawl_loader, FireCrawlLoader): - content_in_markdown = url_crawled[0].page_content - elif isinstance(crawl_loader, AsyncChromiumLoader): - content_in_markdown = md.transform_documents(url_crawled)[0].page_content - - # Format document - await task_logger.log_task_progress( - log_entry, - f"Processing crawled content 
from: {url}", - {"stage": "content_processing", "content_length": len(content_in_markdown)}, - ) - - # Format document metadata in a more maintainable way - metadata_sections = [ - ( - "METADATA", - [ - f"{key.upper()}: {value}" - for key, value in url_crawled[0].metadata.items() - ], - ), - ( - "CONTENT", - ["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"], - ), - ] - - # Build the document string more efficiently - document_parts = [] - document_parts.append("") - - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) - - # Check for duplicates - await task_logger.log_task_progress( - log_entry, - f"Checking for duplicate content: {url}", - {"stage": "duplicate_check", "content_hash": content_hash}, - ) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - await task_logger.log_task_success( - log_entry, - f"Document already exists for URL: {url}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - }, - ) - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." - ) - return existing_document - - # Get LLM for summary generation - await task_logger.log_task_progress( - log_entry, - f"Preparing for summary generation: {url}", - {"stage": "llm_setup"}, - ) - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - await task_logger.log_task_progress( - log_entry, - f"Generating summary for URL content: {url}", - {"stage": "summary_generation"}, - ) - - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - await task_logger.log_task_progress( - log_entry, - f"Processing content chunks for URL: {url}", - {"stage": "chunk_processing"}, - ) - - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(content_in_markdown) - ] - - # Create and store document - await task_logger.log_task_progress( - log_entry, - f"Creating document in database for URL: {url}", - {"stage": "document_creation", "chunks_count": len(chunks)}, - ) - - document = Document( - search_space_id=search_space_id, - title=url_crawled[0].metadata["title"] - if isinstance(crawl_loader, FireCrawlLoader) - else url_crawled[0].metadata["source"], - document_type=DocumentType.CRAWLED_URL, - document_metadata=url_crawled[0].metadata, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully crawled and processed URL: {url}", - { - "document_id": document.id, 
- "title": document.title, - "content_hash": content_hash, - "chunks_count": len(chunks), - "summary_length": len(summary_content), - }, - ) - - return document - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error while processing URL: {url}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - raise db_error - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to crawl URL: {url}", - str(e), - {"error_type": type(e).__name__}, - ) - raise RuntimeError(f"Failed to crawl URL: {e!s}") from e - - -async def add_extension_received_document( - session: AsyncSession, - content: ExtensionDocumentContent, - search_space_id: int, - user_id: str, -) -> Document | None: - """ - Process and store document content received from the SurfSense Extension. - - Args: - session: Database session - content: Document content from extension - search_space_id: ID of the search space - - Returns: - Document object if successful, None if failed - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="extension_document", - source="background_task", - message=f"Processing extension document: {content.metadata.VisitedWebPageTitle}", - metadata={ - "url": content.metadata.VisitedWebPageURL, - "title": content.metadata.VisitedWebPageTitle, - "user_id": str(user_id), - }, - ) - - try: - # Format document metadata in a more maintainable way - metadata_sections = [ - ( - "METADATA", - [ - f"SESSION_ID: {content.metadata.BrowsingSessionId}", - f"URL: {content.metadata.VisitedWebPageURL}", - f"TITLE: {content.metadata.VisitedWebPageTitle}", - f"REFERRER: {content.metadata.VisitedWebPageReffererURL}", - f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}", - f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}", - ], - ), - ( - "CONTENT", - ["FORMAT: markdown", "TEXT_START", content.pageContent, "TEXT_END"], - ), - ] - - # Build the document string more efficiently - document_parts = [] - document_parts.append("") - - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - await task_logger.log_task_success( - log_entry, - f"Extension document already exists: {content.metadata.VisitedWebPageTitle}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - }, - ) - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." 
- ) - return existing_document - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(content.pageContent) - ] - - # Create and store document - document = Document( - search_space_id=search_space_id, - title=content.metadata.VisitedWebPageTitle, - document_type=DocumentType.EXTENSION, - document_metadata=content.metadata.model_dump(), - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully processed extension document: {content.metadata.VisitedWebPageTitle}", - { - "document_id": document.id, - "content_hash": content_hash, - "url": content.metadata.VisitedWebPageURL, - }, - ) - - return document - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error processing extension document: {content.metadata.VisitedWebPageTitle}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - raise db_error - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to process extension document: {content.metadata.VisitedWebPageTitle}", - str(e), - {"error_type": type(e).__name__}, - ) - raise RuntimeError(f"Failed to process extension document: {e!s}") from e - - -async def add_received_markdown_file_document( - session: AsyncSession, - file_name: str, - file_in_markdown: str, - search_space_id: int, - user_id: str, -) -> Document | None: - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="markdown_file_document", - source="background_task", - message=f"Processing markdown file: {file_name}", - metadata={ - "filename": file_name, - "user_id": str(user_id), - "content_length": len(file_in_markdown), - }, - ) - - try: - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - await task_logger.log_task_success( - log_entry, - f"Markdown file document already exists: {file_name}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - }, - ) - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." 
- ) - return existing_document - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(file_in_markdown) - ] - - # Create and store document - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=DocumentType.FILE, - document_metadata={ - "FILE_NAME": file_name, - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully processed markdown file: {file_name}", - { - "document_id": document.id, - "content_hash": content_hash, - "chunks_count": len(chunks), - "summary_length": len(summary_content), - }, - ) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error processing markdown file: {file_name}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - raise db_error - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to process markdown file: {file_name}", - str(e), - {"error_type": type(e).__name__}, - ) - raise RuntimeError(f"Failed to process file document: {e!s}") from e - - -async def add_received_file_document_using_unstructured( - session: AsyncSession, - file_name: str, - unstructured_processed_elements: list[LangChainDocument], - search_space_id: int, - user_id: str, -) -> Document | None: - try: - file_in_markdown = await convert_document_to_markdown( - unstructured_processed_elements - ) - - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." 
- ) - return existing_document - - # TODO: Check if file_markdown exceeds token limit of embedding model - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(file_in_markdown) - ] - - # Create and store document - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=DocumentType.FILE, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError(f"Failed to process file document: {e!s}") from e - - -async def add_received_file_document_using_llamacloud( - session: AsyncSession, - file_name: str, - llamacloud_markdown_document: str, - search_space_id: int, - user_id: str, -) -> Document | None: - """ - Process and store document content parsed by LlamaCloud. - - Args: - session: Database session - file_name: Name of the processed file - llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing - search_space_id: ID of the search space - - Returns: - Document object if successful, None if failed - """ - try: - # Combine all markdown documents into one - file_in_markdown = llamacloud_markdown_document - - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." 
- ) - return existing_document - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(file_in_markdown) - ] - - # Create and store document - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=DocumentType.FILE, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using LlamaCloud: {e!s}" - ) from e - - -async def add_received_file_document_using_docling( - session: AsyncSession, - file_name: str, - docling_markdown_document: str, - search_space_id: int, - user_id: str, -) -> Document | None: - """ - Process and store document content parsed by Docling. - - Args: - session: Database session - file_name: Name of the processed file - docling_markdown_document: Markdown content from Docling parsing - search_space_id: ID of the search space - user_id: ID of the user - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = docling_markdown_document - - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - logging.info( - f"Document with content hash {content_hash} already exists. Skipping processing." 
- ) - return existing_document - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary using chunked processing for large documents - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - summary_content = await docling_service.process_large_document_summary( - content=file_in_markdown, llm=user_llm, document_title=file_name - ) - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(file_in_markdown) - ] - - # Create and store document - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=DocumentType.FILE, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using Docling: {e!s}" - ) from e - - -async def add_youtube_video_document( - session: AsyncSession, url: str, search_space_id: int, user_id: str -): - """ - Process a YouTube video URL, extract transcripts, and store as a document. - - Args: - session: Database session for storing the document - url: YouTube video URL (supports standard, shortened, and embed formats) - search_space_id: ID of the search space to add the document to - - Returns: - Document: The created document object - - Raises: - ValueError: If the YouTube video ID cannot be extracted from the URL - SQLAlchemyError: If there's a database error - RuntimeError: If the video processing fails - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="youtube_video_document", - source="background_task", - message=f"Starting YouTube video processing for: {url}", - metadata={"url": url, "user_id": str(user_id)}, - ) - - try: - # Extract video ID from URL - await task_logger.log_task_progress( - log_entry, - f"Extracting video ID from URL: {url}", - {"stage": "video_id_extraction"}, - ) - - def get_youtube_video_id(url: str): - parsed_url = urlparse(url) - hostname = parsed_url.hostname - - if hostname == "youtu.be": - return parsed_url.path[1:] - if hostname in ("www.youtube.com", "youtube.com"): - if parsed_url.path == "/watch": - query_params = parse_qs(parsed_url.query) - return query_params.get("v", [None])[0] - if parsed_url.path.startswith("/embed/"): - return parsed_url.path.split("/")[2] - if parsed_url.path.startswith("/v/"): - return parsed_url.path.split("/")[2] - return None - - # Get video ID - video_id = get_youtube_video_id(url) - if not video_id: - raise ValueError(f"Could not extract video ID from URL: {url}") - - await task_logger.log_task_progress( - log_entry, - f"Video ID extracted: {video_id}", - {"stage": "video_id_extracted", "video_id": video_id}, - ) - - # Get video metadata - await task_logger.log_task_progress( - log_entry, - f"Fetching video metadata for: {video_id}", - {"stage": 
"metadata_fetch"}, - ) - - params = { - "format": "json", - "url": f"https://www.youtube.com/watch?v={video_id}", - } - oembed_url = "https://www.youtube.com/oembed" - - async with ( - aiohttp.ClientSession() as http_session, - http_session.get(oembed_url, params=params) as response, - ): - video_data = await response.json() - - await task_logger.log_task_progress( - log_entry, - f"Video metadata fetched: {video_data.get('title', 'Unknown')}", - { - "stage": "metadata_fetched", - "title": video_data.get("title"), - "author": video_data.get("author_name"), - }, - ) - - # Get video transcript - await task_logger.log_task_progress( - log_entry, - f"Fetching transcript for video: {video_id}", - {"stage": "transcript_fetch"}, - ) - - try: - captions = YouTubeTranscriptApi.get_transcript(video_id) - # Include complete caption information with timestamps - transcript_segments = [] - for line in captions: - start_time = line.get("start", 0) - duration = line.get("duration", 0) - text = line.get("text", "") - timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]" - transcript_segments.append(f"{timestamp} {text}") - transcript_text = "\n".join(transcript_segments) - - await task_logger.log_task_progress( - log_entry, - f"Transcript fetched successfully: {len(captions)} segments", - { - "stage": "transcript_fetched", - "segments_count": len(captions), - "transcript_length": len(transcript_text), - }, - ) - except Exception as e: - transcript_text = f"No captions available for this video. Error: {e!s}" - await task_logger.log_task_progress( - log_entry, - f"No transcript available for video: {video_id}", - {"stage": "transcript_unavailable", "error": str(e)}, - ) - - # Format document - await task_logger.log_task_progress( - log_entry, - f"Processing video content: {video_data.get('title', 'YouTube Video')}", - {"stage": "content_processing"}, - ) - - # Format document metadata in a more maintainable way - metadata_sections = [ - ( - "METADATA", - [ - f"TITLE: {video_data.get('title', 'YouTube Video')}", - f"URL: {url}", - f"VIDEO_ID: {video_id}", - f"AUTHOR: {video_data.get('author_name', 'Unknown')}", - f"THUMBNAIL: {video_data.get('thumbnail_url', '')}", - ], - ), - ( - "CONTENT", - ["FORMAT: transcript", "TEXT_START", transcript_text, "TEXT_END"], - ), - ] - - # Build the document string more efficiently - document_parts = [] - document_parts.append("") - - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) - - # Check for duplicates - await task_logger.log_task_progress( - log_entry, - f"Checking for duplicate video content: {video_id}", - {"stage": "duplicate_check", "content_hash": content_hash}, - ) - - # Check if document with this content hash already exists - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document = existing_doc_result.scalars().first() - - if existing_document: - await task_logger.log_task_success( - log_entry, - f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - "video_id": video_id, - }, - ) - logging.info( - f"Document with content hash {content_hash} already exists. 
Skipping processing." - ) - return existing_document - - # Get LLM for summary generation - await task_logger.log_task_progress( - log_entry, - f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}", - {"stage": "llm_setup"}, - ) - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - raise RuntimeError(f"No long context LLM configured for user {user_id}") - - # Generate summary - await task_logger.log_task_progress( - log_entry, - f"Generating summary for video: {video_data.get('title', 'YouTube Video')}", - {"stage": "summary_generation"}, - ) - - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - - # Process chunks - await task_logger.log_task_progress( - log_entry, - f"Processing content chunks for video: {video_data.get('title', 'YouTube Video')}", - {"stage": "chunk_processing"}, - ) - - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(combined_document_string) - ] - - # Create document - await task_logger.log_task_progress( - log_entry, - f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_creation", "chunks_count": len(chunks)}, - ) - - document = Document( - title=video_data.get("title", "YouTube Video"), - document_type=DocumentType.YOUTUBE_VIDEO, - document_metadata={ - "url": url, - "video_id": video_id, - "video_title": video_data.get("title", "YouTube Video"), - "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", ""), - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - search_space_id=search_space_id, - content_hash=content_hash, - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully processed YouTube video: {video_data.get('title', 'YouTube Video')}", - { - "document_id": document.id, - "video_id": video_id, - "title": document.title, - "content_hash": content_hash, - "chunks_count": len(chunks), - "summary_length": len(summary_content), - "has_transcript": "No captions available" not in transcript_text, - }, - ) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error while processing YouTube video: {url}", - str(db_error), - { - "error_type": "SQLAlchemyError", - "video_id": video_id if "video_id" in locals() else None, - }, - ) - raise db_error - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to process YouTube video: {url}", - str(e), - { - "error_type": type(e).__name__, - "video_id": video_id if "video_id" in locals() else None, - }, - ) - logging.error(f"Failed to process YouTube video: {e!s}") - raise diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py new file mode 100644 index 0000000..048a136 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -0,0 +1,54 @@ +""" +Connector indexers module for background tasks. 
+ +This module provides a collection of connector indexers for different platforms +and services. Each indexer is responsible for handling the indexing of content +from a specific connector type. + +Available indexers: +- Slack: Index messages from Slack channels +- Notion: Index pages from Notion workspaces +- GitHub: Index repositories and files from GitHub +- Linear: Index issues from Linear workspaces +- Jira: Index issues from Jira projects +- Confluence: Index pages from Confluence spaces +- Discord: Index messages from Discord servers +- ClickUp: Index tasks from ClickUp workspaces +- Google Calendar: Index events from Google Calendar +""" + +# Communication platforms +from .clickup_indexer import index_clickup_tasks +from .confluence_indexer import index_confluence_pages +from .discord_indexer import index_discord_messages + +# Development platforms +from .github_indexer import index_github_repos + +# Calendar and scheduling +from .google_calendar_indexer import index_google_calendar_events +from .jira_indexer import index_jira_issues + +# Issue tracking and project management +from .linear_indexer import index_linear_issues + +# Documentation and knowledge management +from .notion_indexer import index_notion_pages +from .slack_indexer import index_slack_messages + +__all__ = [ + "index_clickup_tasks", + "index_confluence_pages", + "index_discord_messages", + # Development platforms + "index_github_repos", + # Calendar and scheduling + "index_google_calendar_events", + "index_jira_issues", + # Issue tracking and project management + "index_linear_issues", + # Documentation and knowledge management + "index_notion_pages", + # Communication platforms + "index_slack_messages", +] diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py new file mode 100644 index 0000000..3ad2faf --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -0,0 +1,183 @@ +""" +Base functionality and shared imports for connector indexers. +""" + +import logging +from datetime import datetime, timedelta + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.config import config +from app.db import ( + Chunk, + Document, + SearchSourceConnector, + SearchSourceConnectorType, +) + +# Set up logging +logger = logging.getLogger(__name__) + + +async def check_duplicate_document_by_hash( + session: AsyncSession, content_hash: str +) -> Document | None: + """ + Check if a document with the given content hash already exists. + + Args: + session: Database session + content_hash: Hash of the document content + + Returns: + Existing document if found, None otherwise + """ + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + return existing_doc_result.scalars().first() + + +async def create_document_chunks(content: str) -> list[Chunk]: + """ + Create chunks from document content. + + Args: + content: Document content to chunk + + Returns: + List of Chunk objects with embeddings + """ + return [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(content) + ] + + +async def get_connector_by_id( + session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType +) -> SearchSourceConnector | None: + """ + Get a connector by ID and type from the database. 
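Taken together, check_duplicate_document_by_hash and create_document_chunks implement the dedupe-then-chunk pattern that every indexer in this package repeats. Below is a minimal sketch of that flow, assuming an AsyncSession and an existing search space; the skip_or_chunk wrapper itself is hypothetical, and generate_content_hash is imported the same way the indexer modules later in this patch import it.

    # Hypothetical helper showing the dedupe-then-chunk flow the indexers share.
    from sqlalchemy.ext.asyncio import AsyncSession

    from app.utils.document_converters import generate_content_hash

    from .base import check_duplicate_document_by_hash, create_document_chunks


    async def skip_or_chunk(session: AsyncSession, content: str, search_space_id: int):
        content_hash = generate_content_hash(content, search_space_id)
        if await check_duplicate_document_by_hash(session, content_hash):
            return None  # caller counts this as a skipped document
        # New content: embed each chunk up front so it can be attached to the Document.
        return content_hash, await create_document_chunks(content)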
+ + Args: + session: Database session + connector_id: ID of the connector + connector_type: Expected type of the connector + + Returns: + Connector object if found, None otherwise + """ + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id, + SearchSourceConnector.connector_type == connector_type, + ) + ) + return result.scalars().first() + + +def calculate_date_range( + connector: SearchSourceConnector, + start_date: str | None = None, + end_date: str | None = None, + default_days_back: int = 365, +) -> tuple[str, str]: + """ + Calculate date range for indexing based on provided dates or connector's last indexed date. + + Args: + connector: The connector object + start_date: Optional start date string (YYYY-MM-DD) + end_date: Optional end date string (YYYY-MM-DD) + default_days_back: Default number of days to go back if no last indexed date + + Returns: + Tuple of (start_date_str, end_date_str) + """ + if start_date is not None and end_date is not None: + return start_date, end_date + + # Fall back to calculating dates based on last_indexed_at + calculated_end_date = datetime.now() + + # Use last_indexed_at as start date if available, otherwise use default_days_back + if connector.last_indexed_at: + # Convert dates to be comparable (both timezone-naive) + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + + # Check if last_indexed_at is in the future or after end_date + if last_indexed_naive > calculated_end_date: + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using {default_days_back} days ago instead." + ) + calculated_start_date = calculated_end_date - timedelta( + days=default_days_back + ) + else: + calculated_start_date = last_indexed_naive + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) + else: + calculated_start_date = calculated_end_date - timedelta(days=default_days_back) + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} ({default_days_back} days ago) as start date" + ) + + # Use calculated dates if not provided + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + + return start_date_str, end_date_str + + +async def update_connector_last_indexed( + session: AsyncSession, + connector: SearchSourceConnector, + update_last_indexed: bool = True, +) -> None: + """ + Update the last_indexed_at timestamp for a connector. + + Args: + session: Database session + connector: The connector object + update_last_indexed: Whether to actually update the timestamp + """ + if update_last_indexed: + connector.last_indexed_at = datetime.now() + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + +def build_document_metadata_string( + metadata_sections: list[tuple[str, list[str]]], +) -> str: + """ + Build a document string from metadata sections. 
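A short worked example of how the calculate_date_range helper above resolves its inputs; the dates are illustrative and connector is any SearchSourceConnector row.

    from .base import calculate_date_range


    def resolve_window(connector):
        # Explicit dates always win, regardless of connector.last_indexed_at.
        start, end = calculate_date_range(connector, "2025-01-01", "2025-03-31")
        # -> ("2025-01-01", "2025-03-31")

        # With no dates supplied, the window is anchored on last_indexed_at.
        start, end = calculate_date_range(connector, default_days_back=30)
        # -> last_indexed_at (as YYYY-MM-DD) through today when it lies in the past;
        #    otherwise today minus default_days_back days through today.
        return start, end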
+ + Args: + metadata_sections: List of (section_title, section_content) tuples + + Returns: + Combined document string + """ + document_parts = [""] + + for section_title, section_content in metadata_sections: + document_parts.append(f"<{section_title}>") + document_parts.extend(section_content) + document_parts.append(f"") + + document_parts.append("") + return "\n".join(document_parts) diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py new file mode 100644 index 0000000..2bad6f8 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py @@ -0,0 +1,301 @@ +""" +ClickUp connector indexer. +""" + +from datetime import datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.clickup_connector import ClickUpConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_clickup_tasks( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index tasks from ClickUp workspace. + + Args: + session: Database session + connector_id: ID of the ClickUp connector + search_space_id: ID of the search space + user_id: ID of the user + start_date: Start date for filtering tasks (YYYY-MM-DD format) + end_date: End date for filtering tasks (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp + + Returns: + Tuple of (number of indexed tasks, error message if any) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="clickup_tasks_indexing", + source="connector_indexing_task", + message=f"Starting ClickUp tasks indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get connector configuration + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.CLICKUP_CONNECTOR + ) + + if not connector: + error_msg = f"ClickUp connector with ID {connector_id} not found" + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a ClickUp connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return 0, error_msg + + # Extract ClickUp configuration + clickup_api_token = connector.config.get("CLICKUP_API_TOKEN") + + if not clickup_api_token: + error_msg = "ClickUp API token not found in connector configuration" + await task_logger.log_task_failure( + log_entry, + f"ClickUp API token not found in connector config for connector {connector_id}", + "Missing ClickUp token", + {"error_type": "MissingToken"}, + ) + return 0, error_msg + + await task_logger.log_task_progress( + log_entry, + f"Initializing ClickUp client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + clickup_client = ClickUpConnector(api_token=clickup_api_token) + + # Get authorized workspaces 
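For reference, a minimal sketch of how a route or background job might call the ClickUp indexer defined above; the caller name, session, and IDs are assumptions of this sketch, and the workspace-fetching code of the indexer continues directly after it.

    # Hypothetical caller; session, connector_id, search_space_id and user_id
    # are assumed to come from the route or scheduler invoking the task.
    from app.tasks.connector_indexers import index_clickup_tasks


    async def run_clickup_indexing(session, connector_id, search_space_id, user_id):
        indexed, error = await index_clickup_tasks(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date="2025-01-01",  # optional; omit both dates to fetch all workspace tasks
            end_date="2025-03-31",
        )
        if error:
            # Failures are reported as (0, message) instead of being raised.
            raise RuntimeError(f"ClickUp indexing failed: {error}")
        return indexed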
+ await task_logger.log_task_progress( + log_entry, + "Fetching authorized ClickUp workspaces", + {"stage": "workspace_fetching"}, + ) + + workspaces_response = clickup_client.get_authorized_workspaces() + workspaces = workspaces_response.get("teams", []) + + if not workspaces: + error_msg = "No authorized ClickUp workspaces found" + await task_logger.log_task_failure( + log_entry, + f"No authorized ClickUp workspaces found for connector {connector_id}", + "No workspaces found", + {"error_type": "NoWorkspacesFound"}, + ) + return 0, error_msg + + documents_indexed = 0 + documents_skipped = 0 + + # Iterate workspaces and fetch tasks + for workspace in workspaces: + workspace_id = workspace.get("id") + workspace_name = workspace.get("name", "Unknown Workspace") + if not workspace_id: + continue + + await task_logger.log_task_progress( + log_entry, + f"Processing workspace: {workspace_name}", + {"stage": "workspace_processing", "workspace_id": workspace_id}, + ) + + # Fetch tasks for date range if provided + if start_date and end_date: + tasks, error = clickup_client.get_tasks_in_date_range( + workspace_id=workspace_id, + start_date=start_date, + end_date=end_date, + include_closed=True, + ) + if error: + logger.warning( + f"Error fetching tasks from workspace {workspace_name}: {error}" + ) + continue + else: + tasks = clickup_client.get_workspace_tasks( + workspace_id=workspace_id, include_closed=True + ) + + await task_logger.log_task_progress( + log_entry, + f"Found {len(tasks)} tasks in workspace {workspace_name}", + {"stage": "tasks_found", "task_count": len(tasks)}, + ) + + for task in tasks: + try: + task_id = task.get("id") + task_name = task.get("name", "Untitled Task") + task_description = task.get("description", "") + task_status = task.get("status", {}).get("status", "Unknown") + task_priority = ( + task.get("priority", {}).get("priority", "Unknown") + if task.get("priority") + else "None" + ) + task_assignees = task.get("assignees", []) + task_due_date = task.get("due_date") + task_created = task.get("date_created") + task_updated = task.get("date_updated") + + task_list = task.get("list", {}) + task_list_name = task_list.get("name", "Unknown List") + task_space = task.get("space", {}) + task_space_name = task_space.get("name", "Unknown Space") + + # Build task content string + content_parts: list[str] = [f"Task: {task_name}"] + if task_description: + content_parts.append(f"Description: {task_description}") + content_parts.extend( + [ + f"Status: {task_status}", + f"Priority: {task_priority}", + f"List: {task_list_name}", + f"Space: {task_space_name}", + ] + ) + if task_assignees: + assignee_names = [ + assignee.get("username", "Unknown") + for assignee in task_assignees + ] + content_parts.append( + f"Assignees: {', '.join(assignee_names)}" + ) + if task_due_date: + content_parts.append(f"Due Date: {task_due_date}") + + task_content = "\n".join(content_parts) + if not task_content.strip(): + logger.warning(f"Skipping task with no content: {task_name}") + documents_skipped += 1 + continue + + # Hash for duplicates + content_hash = generate_content_hash(task_content, search_space_id) + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing." 
+ ) + documents_skipped += 1 + continue + + # Embedding and chunks + summary_embedding = config.embedding_model_instance.embed( + task_content + ) + chunks = await create_document_chunks(task_content) + + document = Document( + search_space_id=search_space_id, + title=f"Task - {task_name}", + document_type=DocumentType.CLICKUP_CONNECTOR, + document_metadata={ + "task_id": task_id, + "task_name": task_name, + "task_status": task_status, + "task_priority": task_priority, + "task_assignees": task_assignees, + "task_due_date": task_due_date, + "task_created": task_created, + "task_updated": task_updated, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=task_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new task {task_name}") + + except Exception as e: + logger.error( + f"Error processing task {task.get('name', 'Unknown')}: {e!s}", + exc_info=True, + ) + documents_skipped += 1 + + total_processed = documents_indexed + + if total_processed > 0: + await update_connector_last_indexed(session, connector, update_last_indexed) + + await session.commit() + + await task_logger.log_task_success( + log_entry, + f"Successfully completed clickup indexing for connector {connector_id}", + { + "pages_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + }, + ) + + logger.info( + f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped" + ) + return total_processed, None + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during ClickUp indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index ClickUp tasks for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index ClickUp tasks: {e!s}", exc_info=True) + return 0, f"Failed to index ClickUp tasks: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py new file mode 100644 index 0000000..57d19da --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -0,0 +1,338 @@ +""" +Confluence connector indexer. 
+""" + +from datetime import datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.confluence_connector import ConfluenceConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + calculate_date_range, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_confluence_pages( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Confluence pages and comments. + + Args: + session: Database session + connector_id: ID of the Confluence connector + search_space_id: ID of the search space to store documents in + user_id: User ID + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="confluence_pages_indexing", + source="connector_indexing_task", + message=f"Starting Confluence pages indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector from the database + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.CONFLUENCE_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return 0, f"Connector with ID {connector_id} not found" + + # Get the Confluence credentials from the connector config + confluence_email = connector.config.get("CONFLUENCE_EMAIL") + confluence_api_token = connector.config.get("CONFLUENCE_API_TOKEN") + confluence_base_url = connector.config.get("CONFLUENCE_BASE_URL") + + if not confluence_email or not confluence_api_token or not confluence_base_url: + await task_logger.log_task_failure( + log_entry, + f"Confluence credentials not found in connector config for connector {connector_id}", + "Missing Confluence credentials", + {"error_type": "MissingCredentials"}, + ) + return 0, "Confluence credentials not found in connector config" + + # Initialize Confluence client + await task_logger.log_task_progress( + log_entry, + f"Initializing Confluence client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + confluence_client = ConfluenceConnector( + base_url=confluence_base_url, + email=confluence_email, + api_token=confluence_api_token, + ) + + # Calculate date range + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Confluence pages from {start_date_str} to {end_date_str}", + { + "stage": "fetching_pages", + "start_date": start_date_str, + "end_date": end_date_str, + }, + ) + + # Get 
pages within date range + try: + pages, error = confluence_client.get_pages_by_date_range( + start_date=start_date_str, end_date=end_date_str, include_comments=True + ) + + if error: + logger.error(f"Failed to get Confluence pages: {error}") + + # Don't treat "No pages found" as an error that should stop indexing + if "No pages found" in error: + logger.info( + "No pages found is not a critical error, continuing with update" + ) + if update_last_indexed: + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found" + ) + + await task_logger.log_task_success( + log_entry, + f"No Confluence pages found in date range {start_date_str} to {end_date_str}", + {"pages_found": 0}, + ) + return 0, None + else: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Confluence pages: {error}", + "API Error", + {"error_type": "APIError"}, + ) + return 0, f"Failed to get Confluence pages: {error}" + + logger.info(f"Retrieved {len(pages)} pages from Confluence API") + + except Exception as e: + logger.error(f"Error fetching Confluence pages: {e!s}", exc_info=True) + return 0, f"Error fetching Confluence pages: {e!s}" + + # Process and index each page + documents_indexed = 0 + skipped_pages = [] + documents_skipped = 0 + + for page in pages: + try: + page_id = page.get("id") + page_title = page.get("title", "") + space_id = page.get("spaceId", "") + + if not page_id or not page_title: + logger.warning( + f"Skipping page with missing ID or title: {page_id or 'Unknown'}" + ) + skipped_pages.append(f"{page_title or 'Unknown'} (missing data)") + documents_skipped += 1 + continue + + # Extract page content + page_content = "" + if page.get("body") and page["body"].get("storage"): + page_content = page["body"]["storage"].get("value", "") + + # Add comments to content + comments = page.get("comments", []) + comments_content = "" + if comments: + comments_content = "\n\n## Comments\n\n" + for comment in comments: + comment_body = "" + if comment.get("body") and comment["body"].get("storage"): + comment_body = comment["body"]["storage"].get("value", "") + + comment_author = comment.get("version", {}).get( + "authorId", "Unknown" + ) + comment_date = comment.get("version", {}).get("createdAt", "") + + comments_content += f"**Comment by {comment_author}** ({comment_date}):\n{comment_body}\n\n" + + # Combine page content with comments + full_content = f"# {page_title}\n\n{page_content}{comments_content}" + + if not full_content.strip(): + logger.warning(f"Skipping page with no content: {page_title}") + skipped_pages.append(f"{page_title} (no content)") + documents_skipped += 1 + continue + + # Create a simple summary + summary_content = ( + f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" + ) + if page_content: + # Take first 500 characters of content for summary + content_preview = page_content[:500] + if len(page_content) > 500: + content_preview += "..." 
+ summary_content += f"Content Preview: {content_preview}\n\n" + + # Add comment count + comment_count = len(comments) + summary_content += f"Comments: {comment_count}" + + # Generate content hash + content_hash = generate_content_hash(full_content, search_space_id) + + # Check if document already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." + ) + documents_skipped += 1 + continue + + # Generate embedding for the summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full page content with comments + chunks = await create_document_chunks(full_content) + + # Create and store new document + logger.info(f"Creating new document for page {page_title}") + document = Document( + search_space_id=search_space_id, + title=f"Confluence - {page_title}", + document_type=DocumentType.CONFLUENCE_CONNECTOR, + document_metadata={ + "page_id": page_id, + "page_title": page_title, + "space_id": space_id, + "comment_count": comment_count, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new page {page_title}") + + except Exception as e: + logger.error( + f"Error processing page {page.get('title', 'Unknown')}: {e!s}", + exc_info=True, + ) + skipped_pages.append( + f"{page.get('title', 'Unknown')} (processing error)" + ) + documents_skipped += 1 + continue # Skip this page and continue with others + + # Update the last_indexed_at timestamp for the connector only if requested + total_processed = documents_indexed + if update_last_indexed: + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Commit all changes + await session.commit() + logger.info( + "Successfully committed all Confluence document changes to database" + ) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Confluence indexing for connector {connector_id}", + { + "pages_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_pages_count": len(skipped_pages), + }, + ) + + logger.info( + f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + ) + return ( + total_processed, + None, + ) # Return None as the error message to indicate success + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Confluence indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Confluence pages for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Confluence pages: {e!s}", exc_info=True) + return 0, f"Failed to index Confluence pages: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py 
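Before the Discord indexer below, note the return contract the Confluence indexer above shares with most of this package: success can legitimately be (0, None) when no pages match, while database and API failures come back as (0, message) after a rollback. A hedged caller-side sketch, with the wrapper name assumed:

    import logging

    from app.tasks.connector_indexers import index_confluence_pages

    logger = logging.getLogger(__name__)


    async def run_confluence_indexing(session, connector_id, search_space_id, user_id):
        total, error = await index_confluence_pages(
            session, connector_id, search_space_id, user_id
        )
        if error is None:
            logger.info(f"Indexed {total} Confluence pages")  # total may be 0 when nothing matched
        else:
            logger.error(f"Confluence indexing failed: {error}")
        return total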
new file mode 100644 index 0000000..37c3405 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -0,0 +1,441 @@ +""" +Discord connector indexer. +""" + +import asyncio +from datetime import UTC, datetime, timedelta + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.discord_connector import DiscordConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.prompts import SUMMARY_PROMPT_TEMPLATE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + build_document_metadata_string, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_discord_messages( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Discord messages from all accessible channels. + + Args: + session: Database session + connector_id: ID of the Discord connector + search_space_id: ID of the search space to store documents in + user_id: ID of the user + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="discord_messages_indexing", + source="connector_indexing_task", + message=f"Starting Discord messages indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector + await task_logger.log_task_progress( + log_entry, + f"Retrieving Discord connector {connector_id} from database", + {"stage": "connector_retrieval"}, + ) + + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.DISCORD_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a Discord connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Discord connector", + ) + + # Get the Discord token from the connector config + discord_token = connector.config.get("DISCORD_BOT_TOKEN") + if not discord_token: + await task_logger.log_task_failure( + log_entry, + f"Discord token not found in connector config for connector {connector_id}", + "Missing Discord token", + {"error_type": "MissingToken"}, + ) + return 0, "Discord token not found in connector config" + + logger.info(f"Starting Discord indexing for connector {connector_id}") + + # Initialize Discord client + await task_logger.log_task_progress( + log_entry, + f"Initializing Discord client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + discord_client = DiscordConnector(token=discord_token) + + # Calculate date range + if start_date is None or end_date is None: + # 
Fall back to calculating dates based on last_indexed_at + calculated_end_date = datetime.now(UTC) + + # Use last_indexed_at as start date if available, otherwise use 365 days ago + if connector.last_indexed_at: + calculated_start_date = connector.last_indexed_at.replace(tzinfo=UTC) + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) + else: + calculated_start_date = calculated_end_date - timedelta(days=365) + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) + + # Use calculated dates if not provided, convert to ISO format for Discord API + if start_date is None: + start_date_iso = calculated_start_date.isoformat() + else: + # Convert YYYY-MM-DD to ISO format + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=UTC) + .isoformat() + ) + + if end_date is None: + end_date_iso = calculated_end_date.isoformat() + else: + # Convert YYYY-MM-DD to ISO format + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d") + .replace(tzinfo=UTC) + .isoformat() + ) + else: + # Convert provided dates to ISO format for Discord API + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=UTC) + .isoformat() + ) + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC).isoformat() + ) + + logger.info( + f"Indexing Discord messages from {start_date_iso} to {end_date_iso}" + ) + + + try: + await task_logger.log_task_progress( + log_entry, + f"Starting Discord bot and fetching guilds for connector {connector_id}", + {"stage": "fetch_guilds"}, + ) + + logger.info("Starting Discord bot to fetch guilds") + discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) + await discord_client._wait_until_ready() + + logger.info("Fetching Discord guilds") + guilds = await discord_client.get_guilds() + logger.info(f"Found {len(guilds)} guilds") + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Discord guilds for connector {connector_id}", + str(e), + {"error_type": "GuildFetchError"}, + ) + logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True) + await discord_client.close_bot() + return 0, f"Failed to get Discord guilds: {e!s}" + + if not guilds: + await task_logger.log_task_success( + log_entry, + f"No Discord guilds found for connector {connector_id}", + {"guilds_found": 0}, + ) + logger.info("No Discord guilds found to index") + await discord_client.close_bot() + return 0, "No Discord guilds found" + + # Track results + documents_indexed = 0 + documents_skipped = 0 + skipped_channels: list[str] = [] + + # Process each guild and channel + await task_logger.log_task_progress( + log_entry, + f"Starting to process {len(guilds)} Discord guilds", + {"stage": "process_guilds", "total_guilds": len(guilds)}, + ) + + try: + for guild in guilds: + guild_id = guild["id"] + guild_name = guild["name"] + logger.info(f"Processing guild: {guild_name} ({guild_id})") + + try: + channels = await discord_client.get_text_channels(guild_id) + if not channels: + logger.info(f"No channels found in guild {guild_name}. 
Skipping.") + skipped_channels.append(f"{guild_name} (no channels)") + documents_skipped += 1 + continue + + for channel in channels: + channel_id = channel["id"] + channel_name = channel["name"] + + try: + messages = await discord_client.get_channel_history( + channel_id=channel_id, + start_date=start_date_iso, + end_date=end_date_iso, + ) + except Exception as e: + logger.error( + f"Failed to get messages for channel {channel_name}: {e!s}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (fetch error)" + ) + documents_skipped += 1 + continue + + if not messages: + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) + documents_skipped += 1 + continue + + # Filter/format messages + formatted_messages: list[dict] = [] + for msg in messages: + # Optionally skip system messages + if msg.get("type") in ["system"]: + continue + formatted_messages.append(msg) + + if not formatted_messages: + logger.info( + f"No valid messages found in channel {channel_name} after filtering." + ) + documents_skipped += 1 + continue + + # Convert messages to markdown format + channel_content = ( + f"# Discord Channel: {guild_name} / {channel_name}\n\n" + ) + for msg in formatted_messages: + user_name = msg.get("author_name", "Unknown User") + timestamp = msg.get("created_at", "Unknown Time") + text = msg.get("content", "") + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) + + # Metadata sections + metadata_sections = [ + ( + "METADATA", + [ + f"GUILD_NAME: {guild_name}", + f"GUILD_ID: {guild_id}", + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + channel_content, + "TEXT_END", + ], + ), + ] + + combined_document_string = build_document_metadata_string( + metadata_sections + ) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) + + # Skip duplicates by hash + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing." 
+ ) + documents_skipped += 1 + continue + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + logger.error( + f"No long context LLM configured for user {user_id}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (no LLM configured)" + ) + documents_skipped += 1 + continue + + # Generate summary using summary_chain + summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Chunks from channel content + chunks = await create_document_chunks(channel_content) + + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Discord - {guild_name}#{channel_name}", + document_type=DocumentType.DISCORD_CONNECTOR, + document_metadata={ + "guild_name": guild_name, + "guild_id": guild_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_count": len(formatted_messages), + "start_date": start_date_iso, + "end_date": end_date_iso, + "indexed_at": datetime.now(UTC).strftime( + "%Y-%m-%d %H:%M:%S" + ), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info( + f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" + ) + + except Exception as e: + logger.error( + f"Error processing guild {guild_name}: {e!s}", exc_info=True + ) + skipped_channels.append(f"{guild_name} (processing error)") + documents_skipped += 1 + continue + finally: + await discord_client.close_bot() + + # Update last_indexed_at only if we indexed at least one + if documents_indexed > 0: + await update_connector_last_indexed(session, connector, update_last_indexed) + + await session.commit() + + # Prepare result message + result_message = None + if skipped_channels: + result_message = ( + f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: " + + ", ".join(skipped_channels) + ) + else: + result_message = f"Processed {documents_indexed} channels." 
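One caller-facing difference worth noting: unlike the other indexers in this package, the Discord indexer returns the human-readable status assembled just above even on success, so a non-None second element is not by itself an error. A sketch under that assumption, with the wrapper name assumed:

    import logging

    from app.tasks.connector_indexers import index_discord_messages

    logger = logging.getLogger(__name__)


    async def run_discord_indexing(session, connector_id, search_space_id, user_id):
        indexed, message = await index_discord_messages(
            session, connector_id, search_space_id, user_id
        )
        # e.g. "Processed 3 channels. Skipped 1 channels: guild#general (fetch error)"
        logger.info(message)
        return indexed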
+ + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Discord indexing for connector {connector_id}", + { + "channels_processed": documents_indexed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_channels_count": len(skipped_channels), + "guilds_processed": len(guilds), + "result_message": result_message, + }, + ) + + logger.info( + f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + ) + return documents_indexed, result_message + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Discord indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Discord messages for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Discord messages: {e!s}", exc_info=True) + return 0, f"Failed to index Discord messages: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py new file mode 100644 index 0000000..3300440 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -0,0 +1,326 @@ +""" +GitHub connector indexer. +""" + +from datetime import UTC, datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.github_connector import GitHubConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, +) + + +async def index_github_repos( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index code and documentation files from accessible GitHub repositories. + + Args: + session: Database session + connector_id: ID of the GitHub connector + search_space_id: ID of the search space to store documents in + user_id: ID of the user + start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates + end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="github_repos_indexing", + source="connector_indexing_task", + message=f"Starting GitHub repositories indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + documents_processed = 0 + errors = [] + + try: + # 1. 
Get the GitHub connector from the database + await task_logger.log_task_progress( + log_entry, + f"Retrieving GitHub connector {connector_id} from database", + {"stage": "connector_retrieval"}, + ) + + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a GitHub connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a GitHub connector", + ) + + # 2. Get the GitHub PAT and selected repositories from the connector config + github_pat = connector.config.get("GITHUB_PAT") + repo_full_names_to_index = connector.config.get("repo_full_names") + + if not github_pat: + await task_logger.log_task_failure( + log_entry, + f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", + "Missing GitHub PAT", + {"error_type": "MissingToken"}, + ) + return 0, "GitHub Personal Access Token (PAT) not found in connector config" + + if not repo_full_names_to_index or not isinstance( + repo_full_names_to_index, list + ): + await task_logger.log_task_failure( + log_entry, + f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}", + "Invalid repo configuration", + {"error_type": "InvalidConfiguration"}, + ) + return 0, "'repo_full_names' not found or is not a list in connector config" + + # 3. Initialize GitHub connector client + await task_logger.log_task_progress( + log_entry, + f"Initializing GitHub client for connector {connector_id}", + { + "stage": "client_initialization", + "repo_count": len(repo_full_names_to_index), + }, + ) + + try: + github_client = GitHubConnector(token=github_pat) + except ValueError as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to initialize GitHub client for connector {connector_id}", + str(e), + {"error_type": "ClientInitializationError"}, + ) + return 0, f"Failed to initialize GitHub client: {e!s}" + + # 4. Validate selected repositories + await task_logger.log_task_progress( + log_entry, + f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", + { + "stage": "repo_processing", + "repo_count": len(repo_full_names_to_index), + "start_date": start_date, + "end_date": end_date, + }, + ) + + logger.info( + f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." + ) + if start_date and end_date: + logger.info( + f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" + ) + + # 6. 
Iterate through selected repositories and index files + for repo_full_name in repo_full_names_to_index: + if not repo_full_name or not isinstance(repo_full_name, str): + logger.warning(f"Skipping invalid repository entry: {repo_full_name}") + continue + + logger.info(f"Processing repository: {repo_full_name}") + try: + files_to_index = github_client.get_repository_files(repo_full_name) + if not files_to_index: + logger.info( + f"No indexable files found in repository: {repo_full_name}" + ) + continue + + logger.info( + f"Found {len(files_to_index)} files to process in {repo_full_name}" + ) + + for file_info in files_to_index: + file_path = file_info.get("path") + file_url = file_info.get("url") + file_sha = file_info.get("sha") + file_type = file_info.get("type") # 'code' or 'doc' + full_path_key = f"{repo_full_name}/{file_path}" + + if not file_path or not file_url or not file_sha: + logger.warning( + f"Skipping file with missing info in {repo_full_name}: {file_info}" + ) + continue + + # Get file content + file_content = github_client.get_file_content( + repo_full_name, file_path + ) + + if file_content is None: + logger.warning( + f"Could not retrieve content for {full_path_key}. Skipping." + ) + continue # Skip if content fetch failed + + content_hash = generate_content_hash(file_content, search_space_id) + + # Check if document with this content hash already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing." + ) + continue + + # Use file_content directly for chunking, maybe summary for main content? + # For now, let's use the full content for both, might need refinement + summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." 
# Simple summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Chunk the content + try: + chunks_data = [ + await create_document_chunks(file_content) + ][0] + + # Use code chunker if available, otherwise regular chunker + if hasattr(config, "code_chunker_instance"): + chunks_data = [ + { + "content": chunk.text, + "embedding": config.embedding_model_instance.embed( + chunk.text + ), + } + for chunk in config.code_chunker_instance.chunk( + file_content + ) + ] + else: + chunks_data = await create_document_chunks(file_content) + + except Exception as chunk_err: + logger.error( + f"Failed to chunk file {full_path_key}: {chunk_err}" + ) + errors.append( + f"Chunking failed for {full_path_key}: {chunk_err}" + ) + continue # Skip this file if chunking fails + + doc_metadata = { + "repository_full_name": repo_full_name, + "file_path": file_path, + "full_path": full_path_key, # For easier lookup + "url": file_url, + "sha": file_sha, + "type": file_type, + "indexed_at": datetime.now(UTC).isoformat(), + } + + # Create new document + logger.info(f"Creating new document for file: {full_path_key}") + document = Document( + title=f"GitHub - {file_path}", + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata=doc_metadata, + content=summary_content, # Store summary + content_hash=content_hash, + embedding=summary_embedding, + search_space_id=search_space_id, + chunks=chunks_data, # Associate chunks directly + ) + session.add(document) + documents_processed += 1 + + except Exception as repo_err: + logger.error( + f"Failed to process repository {repo_full_name}: {repo_err}" + ) + errors.append(f"Failed processing {repo_full_name}: {repo_err}") + + # Commit all changes at the end + await session.commit() + logger.info( + f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." 
+ ) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed GitHub indexing for connector {connector_id}", + { + "documents_processed": documents_processed, + "errors_count": len(errors), + "repo_count": len(repo_full_names_to_index), + }, + ) + + except SQLAlchemyError as db_err: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during GitHub indexing for connector {connector_id}", + str(db_err), + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during GitHub indexing for connector {connector_id}: {db_err}" + ) + errors.append(f"Database error: {db_err}") + return documents_processed, "; ".join(errors) if errors else str(db_err) + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Unexpected error during GitHub indexing for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error( + f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", + exc_info=True, + ) + errors.append(f"Unexpected error: {e}") + return documents_processed, "; ".join(errors) if errors else str(e) + + error_message = "; ".join(errors) if errors else None + return documents_processed, error_message diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py new file mode 100644 index 0000000..37f6362 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -0,0 +1,350 @@ +""" +Google Calendar connector indexer. +""" + +from datetime import datetime, timedelta + +from google.oauth2.credentials import Credentials +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.google_calendar_connector import GoogleCalendarConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_google_calendar_events( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Google Calendar events. 
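Looking back at the chunking fallback inside the GitHub indexer above: the initial create_document_chunks result is immediately overwritten, and the code-chunker branch builds plain dicts while the Document.chunks relationship elsewhere in this patch receives Chunk objects. A minimal sketch of what appears to be the intended logic; the helper name is hypothetical, and code_chunker_instance is only present in some configurations.

    from app.config import config
    from app.db import Chunk  # not currently imported by github_indexer.py

    from .base import create_document_chunks


    async def chunk_repository_file(file_content: str) -> list[Chunk]:
        # Prefer a code-aware chunker when one is configured, but always return
        # Chunk rows so they can be attached to the Document like elsewhere.
        if hasattr(config, "code_chunker_instance"):
            return [
                Chunk(
                    content=chunk.text,
                    embedding=config.embedding_model_instance.embed(chunk.text),
                )
                for chunk in config.code_chunker_instance.chunk(file_content)
            ]
        return await create_document_chunks(file_content)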
+ + Args: + session: Database session + connector_id: ID of the Google Calendar connector + search_space_id: ID of the search space to store documents in + user_id: User ID + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="google_calendar_events_indexing", + source="connector_indexing_task", + message=f"Starting Google Calendar events indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector from the database + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return 0, f"Connector with ID {connector_id} not found" + + # Get the Google Calendar credentials from the connector config + credentials = Credentials( + token=connector.config.get("token"), + refresh_token=connector.config.get("refresh_token"), + token_uri=connector.config.get("token_uri"), + client_id=connector.config.get("client_id"), + client_secret=connector.config.get("client_secret"), + scopes=connector.config.get("scopes"), + ) + + if ( + not credentials.client_id + or not credentials.client_secret + or not credentials.refresh_token + ): + await task_logger.log_task_failure( + log_entry, + f"Google Calendar credentials not found in connector config for connector {connector_id}", + "Missing Google Calendar credentials", + {"error_type": "MissingCredentials"}, + ) + return 0, "Google Calendar credentials not found in connector config" + + # Initialize Google Calendar client + await task_logger.log_task_progress( + log_entry, + f"Initializing Google Calendar client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + calendar_client = GoogleCalendarConnector(credentials=credentials) + + # Calculate date range + if start_date is None or end_date is None: + # Fall back to calculating dates based on last_indexed_at + calculated_end_date = datetime.now() + + # Use last_indexed_at as start date if available, otherwise use 30 days ago + if connector.last_indexed_at: + # Convert dates to be comparable (both timezone-naive) + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + + # Check if last_indexed_at is in the future or after end_date + if last_indexed_naive > calculated_end_date: + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead." 
+ ) + calculated_start_date = calculated_end_date - timedelta(days=30) + else: + calculated_start_date = last_indexed_naive + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) + else: + calculated_start_date = calculated_end_date - timedelta( + days=30 + ) # Use 30 days as default for calendar events + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (30 days ago) as start date" + ) + + # Use calculated dates if not provided + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = ( + end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + ) + else: + # Use provided dates + start_date_str = start_date + end_date_str = end_date + + await task_logger.log_task_progress( + log_entry, + f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", + { + "stage": "fetching_events", + "start_date": start_date_str, + "end_date": end_date_str, + }, + ) + + # Get events within date range from primary calendar + try: + events, error = calendar_client.get_all_primary_calendar_events( + start_date=start_date_str, end_date=end_date_str + ) + + if error: + logger.error(f"Failed to get Google Calendar events: {error}") + + # Don't treat "No events found" as an error that should stop indexing + if "No events found" in error: + logger.info( + "No events found is not a critical error, continuing with update" + ) + if update_last_indexed: + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found" + ) + + await task_logger.log_task_success( + log_entry, + f"No Google Calendar events found in date range {start_date_str} to {end_date_str}", + {"events_found": 0}, + ) + return 0, None + else: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Google Calendar events: {error}", + "API Error", + {"error_type": "APIError"}, + ) + return 0, f"Failed to get Google Calendar events: {error}" + + logger.info(f"Retrieved {len(events)} events from Google Calendar API") + + except Exception as e: + logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True) + return 0, f"Error fetching Google Calendar events: {e!s}" + + documents_indexed = 0 + documents_skipped = 0 + skipped_events = [] + + for event in events: + try: + event_id = event.get("id") + event_summary = event.get("summary", "No Title") + calendar_id = event.get("calendarId", "") + + if not event_id: + logger.warning(f"Skipping event with missing ID: {event_summary}") + skipped_events.append(f"{event_summary} (missing ID)") + documents_skipped += 1 + continue + + event_markdown = calendar_client.format_event_to_markdown(event) + if not event_markdown.strip(): + logger.warning(f"Skipping event with no content: {event_summary}") + skipped_events.append(f"{event_summary} (no content)") + documents_skipped += 1 + continue + + start = event.get("start", {}) + end = event.get("end", {}) + start_time = start.get("dateTime") or start.get("date", "") + end_time = end.get("dateTime") or end.get("date", "") + location = event.get("location", "") + description = event.get("description", "") + + summary_content = f"Google Calendar Event: {event_summary}\n\n" + summary_content += f"Calendar: {calendar_id}\n" + summary_content += f"Start: {start_time}\n" + summary_content += f"End: {end_time}\n" + if location: + 
summary_content += f"Location: {location}\n" + if description: + desc_preview = description[:300] + if len(description) > 300: + desc_preview += "..." + summary_content += f"Description: {desc_preview}\n" + + content_hash = generate_content_hash(event_markdown, search_space_id) + + # Duplicate check via simple query using helper in base + from .base import ( + check_duplicate_document_by_hash, # local import to avoid circular at module import + ) + + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing." + ) + documents_skipped += 1 + continue + + # Embeddings and chunks + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + chunks = await create_document_chunks(event_markdown) + + document = Document( + search_space_id=search_space_id, + title=f"Calendar Event - {event_summary}", + document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, + document_metadata={ + "event_id": event_id, + "event_summary": event_summary, + "calendar_id": calendar_id, + "start_time": start_time, + "end_time": end_time, + "location": location, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new event {event_summary}") + + except Exception as e: + logger.error( + f"Error processing event {event.get('summary', 'Unknown')}: {e!s}", + exc_info=True, + ) + skipped_events.append( + f"{event.get('summary', 'Unknown')} (processing error)" + ) + documents_skipped += 1 + continue + + total_processed = documents_indexed + if total_processed > 0: + await update_connector_last_indexed(session, connector, update_last_indexed) + + await session.commit() + + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Calendar indexing for connector {connector_id}", + { + "events_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_events_count": len(skipped_events), + }, + ) + + logger.info( + f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + ) + return total_processed, None + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Google Calendar indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Google Calendar events for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Google Calendar events: {e!s}", exc_info=True) + return 0, f"Failed to index Google Calendar events: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py new file mode 100644 index 0000000..75b20f9 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -0,0 +1,320 @@ +""" +Jira connector indexer. 
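+
+ Illustrative usage sketch (the connector ID, search space ID, and user UUID
+ are hypothetical placeholders, not values from this patch):
+
+     documents_indexed, error = await index_jira_issues(
+         session, connector_id=42, search_space_id=7, user_id="user-uuid"
+     )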
+""" + +from datetime import datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.jira_connector import JiraConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + calculate_date_range, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_jira_issues( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Jira issues and comments. + + Args: + session: Database session + connector_id: ID of the Jira connector + search_space_id: ID of the search space to store documents in + user_id: User ID + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="jira_issues_indexing", + source="connector_indexing_task", + message=f"Starting Jira issues indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector from the database + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.JIRA_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return 0, f"Connector with ID {connector_id} not found" + + # Get the Jira credentials from the connector config + jira_email = connector.config.get("JIRA_EMAIL") + jira_api_token = connector.config.get("JIRA_API_TOKEN") + jira_base_url = connector.config.get("JIRA_BASE_URL") + + if not jira_email or not jira_api_token or not jira_base_url: + await task_logger.log_task_failure( + log_entry, + f"Jira credentials not found in connector config for connector {connector_id}", + "Missing Jira credentials", + {"error_type": "MissingCredentials"}, + ) + return 0, "Jira credentials not found in connector config" + + # Initialize Jira client + await task_logger.log_task_progress( + log_entry, + f"Initializing Jira client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + jira_client = JiraConnector( + base_url=jira_base_url, email=jira_email, api_token=jira_api_token + ) + + # Calculate date range + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + await task_logger.log_task_progress( + log_entry, + f"Fetching Jira issues from {start_date_str} to {end_date_str}", + { + "stage": "fetching_issues", + "start_date": start_date_str, + "end_date": end_date_str, + }, + ) + + # Get issues within date range + try: + issues, error = jira_client.get_issues_by_date_range( + start_date=start_date_str, end_date=end_date_str, include_comments=True + ) + + if 
error: + logger.error(f"Failed to get Jira issues: {error}") + + # Don't treat "No issues found" as an error that should stop indexing + if "No issues found" in error: + logger.info( + "No issues found is not a critical error, continuing with update" + ) + if update_last_indexed: + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) + + await task_logger.log_task_success( + log_entry, + f"No Jira issues found in date range {start_date_str} to {end_date_str}", + {"issues_found": 0}, + ) + return 0, None + else: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Jira issues: {error}", + "API Error", + {"error_type": "APIError"}, + ) + return 0, f"Failed to get Jira issues: {error}" + + logger.info(f"Retrieved {len(issues)} issues from Jira API") + + except Exception as e: + logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True) + return 0, f"Error fetching Jira issues: {e!s}" + + # Process and index each issue + documents_indexed = 0 + skipped_issues = [] + documents_skipped = 0 + + for issue in issues: + try: + issue_id = issue.get("key") + issue_identifier = issue.get("key", "") + issue_title = issue.get("id", "") + + if not issue_id or not issue_title: + logger.warning( + f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" + ) + skipped_issues.append( + f"{issue_identifier or 'Unknown'} (missing data)" + ) + documents_skipped += 1 + continue + + # Format the issue for better readability + formatted_issue = jira_client.format_issue(issue) + + # Convert to markdown + issue_content = jira_client.format_issue_to_markdown(formatted_issue) + + if not issue_content: + logger.warning( + f"Skipping issue with no content: {issue_identifier} - {issue_title}" + ) + skipped_issues.append(f"{issue_identifier} (no content)") + documents_skipped += 1 + continue + + # Create a simple summary + summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" + if formatted_issue.get("description"): + summary_content += ( + f"Description: {formatted_issue.get('description')}\n\n" + ) + + # Add comment count + comment_count = len(formatted_issue.get("comments", [])) + summary_content += f"Comments: {comment_count}" + + # Generate content hash + content_hash = generate_content_hash(issue_content, search_space_id) + + # Check if document already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." 
+ ) + documents_skipped += 1 + continue + + # Generate embedding for the summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full issue content with comments + chunks = await create_document_chunks(issue_content) + + # Create and store new document + logger.info( + f"Creating new document for issue {issue_identifier} - {issue_title}" + ) + document = Document( + search_space_id=search_space_id, + title=f"Jira - {issue_identifier}: {issue_title}", + document_type=DocumentType.JIRA_CONNECTOR, + document_metadata={ + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "state": formatted_issue.get("status", "Unknown"), + "comment_count": comment_count, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info( + f"Successfully indexed new issue {issue_identifier} - {issue_title}" + ) + + except Exception as e: + logger.error( + f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + exc_info=True, + ) + skipped_issues.append( + f"{issue.get('identifier', 'Unknown')} (processing error)" + ) + documents_skipped += 1 + continue # Skip this issue and continue with others + + # Update the last_indexed_at timestamp for the connector only if requested + total_processed = documents_indexed + if update_last_indexed: + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Commit all changes + await session.commit() + logger.info("Successfully committed all JIRA document changes to database") + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed JIRA indexing for connector {connector_id}", + { + "issues_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_issues_count": len(skipped_issues), + }, + ) + + logger.info( + f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + ) + return ( + total_processed, + None, + ) # Return None as the error message to indicate success + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during JIRA indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index JIRA issues for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index JIRA issues: {e!s}", exc_info=True) + return 0, f"Failed to index JIRA issues: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py new file mode 100644 index 0000000..dfe0aff --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -0,0 +1,337 @@ +""" +Linear connector indexer. 
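+
+ Illustrative usage sketch (the connector ID, search space ID, and user UUID
+ are hypothetical placeholders, not values from this patch):
+
+     documents_indexed, error = await index_linear_issues(
+         session, connector_id=42, search_space_id=7, user_id="user-uuid"
+     )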
+""" + +from datetime import datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.linear_connector import LinearConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + calculate_date_range, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_linear_issues( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Linear issues and comments. + + Args: + session: Database session + connector_id: ID of the Linear connector + search_space_id: ID of the search space to store documents in + user_id: ID of the user + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="linear_issues_indexing", + source="connector_indexing_task", + message=f"Starting Linear issues indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector + await task_logger.log_task_progress( + log_entry, + f"Retrieving Linear connector {connector_id} from database", + {"stage": "connector_retrieval"}, + ) + + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.LINEAR_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a Linear connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Linear connector", + ) + + # Get the Linear token from the connector config + linear_token = connector.config.get("LINEAR_API_KEY") + if not linear_token: + await task_logger.log_task_failure( + log_entry, + f"Linear API token not found in connector config for connector {connector_id}", + "Missing Linear token", + {"error_type": "MissingToken"}, + ) + return 0, "Linear API token not found in connector config" + + # Initialize Linear client + await task_logger.log_task_progress( + log_entry, + f"Initializing Linear client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + linear_client = LinearConnector(token=linear_token) + + # Calculate date range + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}") + + await task_logger.log_task_progress( + log_entry, + f"Fetching Linear issues from {start_date_str} to {end_date_str}", + { + "stage": "fetch_issues", + "start_date": start_date_str, + "end_date": end_date_str, + }, + ) + + # Get issues within date range + try: + issues, error = 
linear_client.get_issues_by_date_range( + start_date=start_date_str, end_date=end_date_str, include_comments=True + ) + + if error: + logger.error(f"Failed to get Linear issues: {error}") + + # Don't treat "No issues found" as an error that should stop indexing + if "No issues found" in error: + logger.info( + "No issues found is not a critical error, continuing with update" + ) + if update_last_indexed: + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) + return 0, None + else: + return 0, f"Failed to get Linear issues: {error}" + + logger.info(f"Retrieved {len(issues)} issues from Linear API") + + except Exception as e: + logger.error(f"Exception when calling Linear API: {e!s}", exc_info=True) + return 0, f"Failed to get Linear issues: {e!s}" + + if not issues: + logger.info("No Linear issues found for the specified date range") + if update_last_indexed: + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) + return 0, None # Return None instead of error message when no issues found + + # Track the number of documents indexed + documents_indexed = 0 + documents_skipped = 0 + skipped_issues = [] + + await task_logger.log_task_progress( + log_entry, + f"Starting to process {len(issues)} Linear issues", + {"stage": "process_issues", "total_issues": len(issues)}, + ) + + # Process each issue + for issue in issues: + try: + issue_id = issue.get("id", "") + issue_identifier = issue.get("identifier", "") + issue_title = issue.get("title", "") + + if not issue_id or not issue_title: + logger.warning( + f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" + ) + skipped_issues.append( + f"{issue_identifier or 'Unknown'} (missing data)" + ) + documents_skipped += 1 + continue + + # Format the issue first to get well-structured data + formatted_issue = linear_client.format_issue(issue) + + # Convert issue to markdown format + issue_content = linear_client.format_issue_to_markdown(formatted_issue) + + if not issue_content: + logger.warning( + f"Skipping issue with no content: {issue_identifier} - {issue_title}" + ) + skipped_issues.append(f"{issue_identifier} (no content)") + documents_skipped += 1 + continue + + # Create a short summary for the embedding + state = formatted_issue.get("state", "Unknown") + description = formatted_issue.get("description", "") + # Truncate description if it's too long for the summary + if description and len(description) > 500: + description = description[:497] + "..." + + # Create a simple summary from the issue data + summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" + if description: + summary_content += f"Description: {description}\n\n" + + # Add comment count + comment_count = len(formatted_issue.get("comments", [])) + summary_content += f"Comments: {comment_count}" + + content_hash = generate_content_hash(issue_content, search_space_id) + + # Check if document with this content hash already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." 
+ ) + documents_skipped += 1 + continue + + # Generate embedding for the summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full issue content with comments + chunks = await create_document_chunks(issue_content) + + # Create and store new document + logger.info( + f"Creating new document for issue {issue_identifier} - {issue_title}" + ) + document = Document( + search_space_id=search_space_id, + title=f"Linear - {issue_identifier}: {issue_title}", + document_type=DocumentType.LINEAR_CONNECTOR, + document_metadata={ + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "state": state, + "comment_count": comment_count, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info( + f"Successfully indexed new issue {issue_identifier} - {issue_title}" + ) + + except Exception as e: + logger.error( + f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + exc_info=True, + ) + skipped_issues.append( + f"{issue.get('identifier', 'Unknown')} (processing error)" + ) + documents_skipped += 1 + continue # Skip this issue and continue with others + + # Update the last_indexed_at timestamp for the connector only if requested + total_processed = documents_indexed + if update_last_indexed: + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Commit all changes + await session.commit() + logger.info("Successfully committed all Linear document changes to database") + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Linear indexing for connector {connector_id}", + { + "issues_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_issues_count": len(skipped_issues), + }, + ) + + logger.info( + f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + ) + return ( + total_processed, + None, + ) # Return None as the error message to indicate success + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Linear indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Linear issues for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Linear issues: {e!s}", exc_info=True) + return 0, f"Failed to index Linear issues: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py new file mode 100644 index 0000000..fbd769e --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -0,0 +1,406 @@ +""" +Notion connector indexer. 
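+
+ Illustrative usage sketch (the connector ID, search space ID, and user UUID
+ are hypothetical placeholders, not values from this patch):
+
+     documents_indexed, error = await index_notion_pages(
+         session, connector_id=42, search_space_id=7, user_id="user-uuid"
+     )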
+""" + +from datetime import datetime, timedelta + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.notion_history import NotionHistoryConnector +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.prompts import SUMMARY_PROMPT_TEMPLATE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + build_document_metadata_string, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_notion_pages( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Notion pages from all accessible pages. + + Args: + session: Database session + connector_id: ID of the Notion connector + search_space_id: ID of the search space to store documents in + user_id: ID of the user + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="notion_pages_indexing", + source="connector_indexing_task", + message=f"Starting Notion pages indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector + await task_logger.log_task_progress( + log_entry, + f"Retrieving Notion connector {connector_id} from database", + {"stage": "connector_retrieval"}, + ) + + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.NOTION_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a Notion connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Notion connector", + ) + + # Get the Notion token from the connector config + notion_token = connector.config.get("NOTION_INTEGRATION_TOKEN") + if not notion_token: + await task_logger.log_task_failure( + log_entry, + f"Notion integration token not found in connector config for connector {connector_id}", + "Missing Notion token", + {"error_type": "MissingToken"}, + ) + return 0, "Notion integration token not found in connector config" + + # Initialize Notion client + await task_logger.log_task_progress( + log_entry, + f"Initializing Notion client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + logger.info(f"Initializing Notion client for connector {connector_id}") + notion_client = NotionHistoryConnector(token=notion_token) + + # Calculate date range + if start_date is None or end_date is None: + # Fall back to calculating dates + calculated_end_date = datetime.now() + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Check for last 1 year of pages + + # Use calculated 
dates if not provided + if start_date is None: + start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ") + else: + # Convert YYYY-MM-DD to ISO format + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + + if end_date is None: + end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ") + else: + # Convert YYYY-MM-DD to ISO format + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + else: + # Convert provided dates to ISO format for Notion API + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + + logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}") + + await task_logger.log_task_progress( + log_entry, + f"Fetching Notion pages from {start_date_iso} to {end_date_iso}", + { + "stage": "fetch_pages", + "start_date": start_date_iso, + "end_date": end_date_iso, + }, + ) + + # Get all pages + try: + pages = notion_client.get_all_pages( + start_date=start_date_iso, end_date=end_date_iso + ) + logger.info(f"Found {len(pages)} Notion pages") + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Notion pages for connector {connector_id}", + str(e), + {"error_type": "PageFetchError"}, + ) + logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True) + return 0, f"Failed to get Notion pages: {e!s}" + + if not pages: + await task_logger.log_task_success( + log_entry, + f"No Notion pages found for connector {connector_id}", + {"pages_found": 0}, + ) + logger.info("No Notion pages found to index") + return 0, "No Notion pages found" + + # Track the number of documents indexed + documents_indexed = 0 + documents_skipped = 0 + skipped_pages = [] + + await task_logger.log_task_progress( + log_entry, + f"Starting to process {len(pages)} Notion pages", + {"stage": "process_pages", "total_pages": len(pages)}, + ) + + # Process each page + for page in pages: + try: + page_id = page.get("page_id") + page_title = page.get("title", f"Untitled page ({page_id})") + page_content = page.get("content", []) + + logger.info(f"Processing Notion page: {page_title} ({page_id})") + + if not page_content: + logger.info(f"No content found in page {page_title}. Skipping.") + skipped_pages.append(f"{page_title} (no content)") + documents_skipped += 1 + continue + + # Convert page content to markdown format + markdown_content = f"# Notion Page: {page_title}\n\n" + + # Process blocks recursively + def process_blocks(blocks, level=0): + result = "" + for block in blocks: + block_type = block.get("type") + block_content = block.get("content", "") + children = block.get("children", []) + + # Add indentation based on level + indent = " " * level + + # Format based on block type + if block_type in ["paragraph", "text"]: + result += f"{indent}{block_content}\n\n" + elif block_type in ["heading_1", "header"]: + result += f"{indent}# {block_content}\n\n" + elif block_type == "heading_2": + result += f"{indent}## {block_content}\n\n" + elif block_type == "heading_3": + result += f"{indent}### {block_content}\n\n" + elif block_type == "bulleted_list_item": + result += f"{indent}* {block_content}\n" + elif block_type == "numbered_list_item": + result += f"{indent}1. 
{block_content}\n" + elif block_type == "to_do": + result += f"{indent}- [ ] {block_content}\n" + elif block_type == "toggle": + result += f"{indent}> {block_content}\n" + elif block_type == "code": + result += f"{indent}```\n{block_content}\n```\n\n" + elif block_type == "quote": + result += f"{indent}> {block_content}\n\n" + elif block_type == "callout": + result += f"{indent}> **Note:** {block_content}\n\n" + elif block_type == "image": + result += f"{indent}![Image]({block_content})\n\n" + else: + # Default for other block types + if block_content: + result += f"{indent}{block_content}\n\n" + + # Process children recursively + if children: + result += process_blocks(children, level + 1) + + return result + + logger.debug( + f"Converting {len(page_content)} blocks to markdown for page {page_title}" + ) + markdown_content += process_blocks(page_content) + + # Format document metadata + metadata_sections = [ + ("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + markdown_content, + "TEXT_END", + ], + ), + ] + + # Build the document string + combined_document_string = build_document_metadata_string( + metadata_sections + ) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) + + # Check if document with this content hash already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." + ) + documents_skipped += 1 + continue + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + logger.error(f"No long context LLM configured for user {user_id}") + skipped_pages.append(f"{page_title} (no LLM configured)") + documents_skipped += 1 + continue + + # Generate summary + logger.debug(f"Generating summary for page {page_title}") + summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks + logger.debug(f"Chunking content for page {page_title}") + chunks = await create_document_chunks(markdown_content) + + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Notion - {page_title}", + document_type=DocumentType.NOTION_CONNECTOR, + document_metadata={ + "page_title": page_title, + "page_id": page_id, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks, + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new Notion page: {page_title}") + + except Exception as e: + logger.error( + f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}", + exc_info=True, + ) + skipped_pages.append( + f"{page.get('title', 'Unknown')} (processing error)" + ) + documents_skipped += 1 + continue # Skip this page and continue with others + + # Update the last_indexed_at timestamp for the connector only if requested + # and if we successfully indexed at least one page + total_processed = documents_indexed + if total_processed > 0: + await update_connector_last_indexed(session, connector, update_last_indexed) + + # 
Commit all changes + await session.commit() + + # Prepare result message + result_message = None + if skipped_pages: + result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" + else: + result_message = f"Processed {total_processed} pages." + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Notion indexing for connector {connector_id}", + { + "pages_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_pages_count": len(skipped_pages), + "result_message": result_message, + }, + ) + + logger.info( + f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + ) + return total_processed, result_message + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Notion indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during Notion indexing: {db_error!s}", exc_info=True + ) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Notion pages for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Notion pages: {e!s}", exc_info=True) + return 0, f"Failed to index Notion pages: {e!s}" diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py new file mode 100644 index 0000000..78b1308 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -0,0 +1,396 @@ +""" +Slack connector indexer. +""" + +from datetime import datetime + +from slack_sdk.errors import SlackApiError +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.connectors.slack_history import SlackHistory +from app.db import Document, DocumentType, SearchSourceConnectorType +from app.prompts import SUMMARY_PROMPT_TEMPLATE +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + build_document_metadata_string, + calculate_date_range, + check_duplicate_document_by_hash, + create_document_chunks, + get_connector_by_id, + logger, + update_connector_last_indexed, +) + + +async def index_slack_messages( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str | None = None, + end_date: str | None = None, + update_last_indexed: bool = True, +) -> tuple[int, str | None]: + """ + Index Slack messages from all accessible channels. 
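+
+    Minimal usage sketch (the connector ID, search space ID, and user UUID are
+    hypothetical placeholders, not values from this patch):
+
+        documents_indexed, error = await index_slack_messages(
+            session, connector_id=42, search_space_id=7, user_id="user-uuid"
+        )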
+ + Args: + session: Database session + connector_id: ID of the Slack connector + search_space_id: ID of the search space to store documents in + user_id: ID of the user + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="slack_messages_indexing", + source="connector_indexing_task", + message=f"Starting Slack messages indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, + ) + + try: + # Get the connector + await task_logger.log_task_progress( + log_entry, + f"Retrieving Slack connector {connector_id} from database", + {"stage": "connector_retrieval"}, + ) + + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.SLACK_CONNECTOR + ) + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found or is not a Slack connector", + "Connector not found", + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Slack connector", + ) + + # Get the Slack token from the connector config + slack_token = connector.config.get("SLACK_BOT_TOKEN") + if not slack_token: + await task_logger.log_task_failure( + log_entry, + f"Slack token not found in connector config for connector {connector_id}", + "Missing Slack token", + {"error_type": "MissingToken"}, + ) + return 0, "Slack token not found in connector config" + + # Initialize Slack client + await task_logger.log_task_progress( + log_entry, + f"Initializing Slack client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + slack_client = SlackHistory(token=slack_token) + + # Calculate date range + await task_logger.log_task_progress( + log_entry, + "Calculating date range for Slack indexing", + { + "stage": "date_calculation", + "provided_start_date": start_date, + "provided_end_date": end_date, + }, + ) + + start_date_str, end_date_str = calculate_date_range( + connector, start_date, end_date, default_days_back=365 + ) + + logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}") + + await task_logger.log_task_progress( + log_entry, + f"Fetching Slack channels from {start_date_str} to {end_date_str}", + { + "stage": "fetch_channels", + "start_date": start_date_str, + "end_date": end_date_str, + }, + ) + + # Get all channels + try: + channels = slack_client.get_all_channels() + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Slack channels for connector {connector_id}", + str(e), + {"error_type": "ChannelFetchError"}, + ) + return 0, f"Failed to get Slack channels: {e!s}" + + if not channels: + await task_logger.log_task_success( + log_entry, + f"No Slack channels found for connector {connector_id}", + {"channels_found": 0}, + ) + return 0, "No Slack channels found" + + # Track the number of documents indexed + documents_indexed = 0 + documents_skipped = 0 + skipped_channels = [] + + await task_logger.log_task_progress( + log_entry, + f"Starting to process {len(channels)} Slack channels", + {"stage": "process_channels", 
"total_channels": len(channels)}, + ) + + # Process each channel + for channel_obj in channels: + channel_id = channel_obj["id"] + channel_name = channel_obj["name"] + is_private = channel_obj["is_private"] + is_member = channel_obj["is_member"] + + try: + # If it's a private channel and the bot is not a member, skip. + if is_private and not is_member: + logger.warning( + f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping." + ) + skipped_channels.append( + f"{channel_name} (private, bot not a member)" + ) + documents_skipped += 1 + continue + + # Get messages for this channel + messages, error = slack_client.get_history_by_date_range( + channel_id=channel_id, + start_date=start_date_str, + end_date=end_date_str, + limit=1000, # Limit to 1000 messages per channel + ) + + if error: + logger.warning( + f"Error getting messages from channel {channel_name}: {error}" + ) + skipped_channels.append(f"{channel_name} (error: {error})") + documents_skipped += 1 + continue # Skip this channel if there's an error + + if not messages: + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) + documents_skipped += 1 + continue # Skip if no messages + + # Format messages with user info + formatted_messages = [] + for msg in messages: + # Skip bot messages and system messages + if msg.get("subtype") in [ + "bot_message", + "channel_join", + "channel_leave", + ]: + continue + + formatted_msg = slack_client.format_message( + msg, include_user_info=True + ) + formatted_messages.append(formatted_msg) + + if not formatted_messages: + logger.info( + f"No valid messages found in channel {channel_name} after filtering." + ) + documents_skipped += 1 + continue # Skip if no valid messages after filtering + + # Convert messages to markdown format + channel_content = f"# Slack Channel: {channel_name}\n\n" + + for msg in formatted_messages: + user_name = msg.get("user_name", "Unknown User") + timestamp = msg.get("datetime", "Unknown Time") + text = msg.get("text", "") + + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) + + # Format document metadata + metadata_sections = [ + ( + "METADATA", + [ + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], + ), + ] + + # Build the document string + combined_document_string = build_document_metadata_string( + metadata_sections + ) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) + + # Check if document with this content hash already exists + existing_document_by_hash = await check_duplicate_document_by_hash( + session, content_hash + ) + + if existing_document_by_hash: + logger.info( + f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." 
+ ) + documents_skipped += 1 + continue + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + logger.error(f"No long context LLM configured for user {user_id}") + skipped_channels.append(f"{channel_name} (no LLM configured)") + documents_skipped += 1 + continue + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks + chunks = await create_document_chunks(channel_content) + + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Slack - {channel_name}", + document_type=DocumentType.SLACK_CONNECTOR, + document_metadata={ + "channel_name": channel_name, + "channel_id": channel_id, + "start_date": start_date_str, + "end_date": end_date_str, + "message_count": len(formatted_messages), + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + documents_indexed += 1 + logger.info( + f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" + ) + + except SlackApiError as slack_error: + logger.error( + f"Slack API error for channel {channel_name}: {slack_error!s}" + ) + skipped_channels.append(f"{channel_name} (Slack API error)") + documents_skipped += 1 + continue # Skip this channel and continue with others + except Exception as e: + logger.error(f"Error processing channel {channel_name}: {e!s}") + skipped_channels.append(f"{channel_name} (processing error)") + documents_skipped += 1 + continue # Skip this channel and continue with others + + # Update the last_indexed_at timestamp for the connector only if requested + # and if we successfully indexed at least one channel + total_processed = documents_indexed + if total_processed > 0: + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Commit all changes + await session.commit() + + # Prepare result message + result_message = None + if skipped_channels: + result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" + else: + result_message = f"Processed {total_processed} channels." 
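+        # Note: result_message is recorded in the task-success metadata and also
+        # returned to the caller below, so skipped channels can be surfaced.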
+ + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Slack indexing for connector {connector_id}", + { + "channels_processed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "skipped_channels_count": len(skipped_channels), + "result_message": result_message, + }, + ) + + logger.info( + f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + ) + return total_processed, result_message + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Slack indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}") + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Slack messages for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Slack messages: {e!s}") + return 0, f"Failed to index Slack messages: {e!s}" diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py deleted file mode 100644 index 72bc18c..0000000 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ /dev/null @@ -1,3375 +0,0 @@ -import asyncio -import logging -from datetime import UTC, datetime, timedelta - -from google.oauth2.credentials import Credentials -from slack_sdk.errors import SlackApiError -from sqlalchemy.exc import SQLAlchemyError -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.future import select - -from app.config import config -from app.connectors.clickup_connector import ClickUpConnector -from app.connectors.confluence_connector import ConfluenceConnector -from app.connectors.discord_connector import DiscordConnector -from app.connectors.github_connector import GitHubConnector -from app.connectors.google_calendar_connector import GoogleCalendarConnector -from app.connectors.jira_connector import JiraConnector -from app.connectors.linear_connector import LinearConnector -from app.connectors.notion_history import NotionHistoryConnector -from app.connectors.slack_history import SlackHistory -from app.db import ( - Chunk, - Document, - DocumentType, - SearchSourceConnector, - SearchSourceConnectorType, -) -from app.prompts import SUMMARY_PROMPT_TEMPLATE -from app.services.llm_service import get_user_long_context_llm -from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import generate_content_hash - -# Set up logging -logger = logging.getLogger(__name__) - - -async def index_slack_messages( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Slack messages from all accessible channels. 
- - Args: - session: Database session - connector_id: ID of the Slack connector - search_space_id: ID of the search space to store documents in - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="slack_messages_indexing", - source="connector_indexing_task", - message=f"Starting Slack messages indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector - await task_logger.log_task_progress( - log_entry, - f"Retrieving Slack connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.SLACK_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a Slack connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not a Slack connector", - ) - - # Get the Slack token from the connector config - slack_token = connector.config.get("SLACK_BOT_TOKEN") - if not slack_token: - await task_logger.log_task_failure( - log_entry, - f"Slack token not found in connector config for connector {connector_id}", - "Missing Slack token", - {"error_type": "MissingToken"}, - ) - return 0, "Slack token not found in connector config" - - # Initialize Slack client - await task_logger.log_task_progress( - log_entry, - f"Initializing Slack client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - slack_client = SlackHistory(token=slack_token) - - # Calculate date range - await task_logger.log_task_progress( - log_entry, - "Calculating date range for Slack indexing", - { - "stage": "date_calculation", - "provided_start_date": start_date, - "provided_end_date": end_date, - }, - ) - - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now() - - # Use last_indexed_at as start date if available, otherwise use 365 days ago - if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = ( - connector.last_indexed_at.replace(tzinfo=None) - if connector.last_indexed_at.tzinfo - else connector.last_indexed_at - ) - - # Check if last_indexed_at is in the future or after end_date - if last_indexed_naive > calculated_end_date: - logger.warning( - f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
- ) - calculated_start_date = calculated_end_date - timedelta(days=365) - else: - calculated_start_date = last_indexed_naive - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta( - days=365 - ) # Use 365 days as default - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" - ) - - # Use calculated dates if not provided - start_date_str = ( - start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - ) - end_date_str = ( - end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") - ) - else: - # Use provided dates - start_date_str = start_date - end_date_str = end_date - - logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}") - - await task_logger.log_task_progress( - log_entry, - f"Fetching Slack channels from {start_date_str} to {end_date_str}", - { - "stage": "fetch_channels", - "start_date": start_date_str, - "end_date": end_date_str, - }, - ) - - # Get all channels - try: - channels = slack_client.get_all_channels() - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Slack channels for connector {connector_id}", - str(e), - {"error_type": "ChannelFetchError"}, - ) - return 0, f"Failed to get Slack channels: {e!s}" - - if not channels: - await task_logger.log_task_success( - log_entry, - f"No Slack channels found for connector {connector_id}", - {"channels_found": 0}, - ) - return 0, "No Slack channels found" - - # Track the number of documents indexed - documents_indexed = 0 - documents_skipped = 0 - skipped_channels = [] - - await task_logger.log_task_progress( - log_entry, - f"Starting to process {len(channels)} Slack channels", - {"stage": "process_channels", "total_channels": len(channels)}, - ) - - # Process each channel - for ( - channel_obj - ) in channels: # Modified loop to iterate over list of channel objects - channel_id = channel_obj["id"] - channel_name = channel_obj["name"] - is_private = channel_obj["is_private"] - is_member = channel_obj[ - "is_member" - ] # This might be False for public channels too - - try: - # If it's a private channel and the bot is not a member, skip. - # For public channels, if they are listed by conversations.list, the bot can typically read history. - # The `not_in_channel` error in get_conversation_history will be the ultimate gatekeeper if history is inaccessible. - if is_private and not is_member: - logger.warning( - f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping." - ) - skipped_channels.append( - f"{channel_name} (private, bot not a member)" - ) - documents_skipped += 1 - continue - - # Get messages for this channel - # The get_history_by_date_range now uses get_conversation_history, - # which handles 'not_in_channel' by returning [] and logging. - messages, error = slack_client.get_history_by_date_range( - channel_id=channel_id, - start_date=start_date_str, - end_date=end_date_str, - limit=1000, # Limit to 1000 messages per channel - ) - - if error: - logger.warning( - f"Error getting messages from channel {channel_name}: {error}" - ) - skipped_channels.append(f"{channel_name} (error: {error})") - documents_skipped += 1 - continue # Skip this channel if there's an error - - if not messages: - logger.info( - f"No messages found in channel {channel_name} for the specified date range." 
- ) - documents_skipped += 1 - continue # Skip if no messages - - # Format messages with user info - formatted_messages = [] - for msg in messages: - # Skip bot messages and system messages - if msg.get("subtype") in [ - "bot_message", - "channel_join", - "channel_leave", - ]: - continue - - formatted_msg = slack_client.format_message( - msg, include_user_info=True - ) - formatted_messages.append(formatted_msg) - - if not formatted_messages: - logger.info( - f"No valid messages found in channel {channel_name} after filtering." - ) - documents_skipped += 1 - continue # Skip if no valid messages after filtering - - # Convert messages to markdown format - channel_content = f"# Slack Channel: {channel_name}\n\n" - - for msg in formatted_messages: - user_name = msg.get("user_name", "Unknown User") - timestamp = msg.get("datetime", "Unknown Time") - text = msg.get("text", "") - - channel_content += ( - f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" - ) - - # Format document metadata - metadata_sections = [ - ( - "METADATA", - [ - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - # f"START_DATE: {start_date_str}", - # f"END_DATE: {end_date_str}", - f"MESSAGE_COUNT: {len(formatted_messages)}", - ], - ), - ( - "CONTENT", - ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], - ), - ] - - # Build the document string - document_parts = [] - document_parts.append("") - - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash( - combined_document_string, search_space_id - ) - - # Check if document with this content hash already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." 
- ) - documents_skipped += 1 - continue - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_channels.append(f"{channel_name} (no LLM configured)") - documents_skipped += 1 - continue - - # Generate summary - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(channel_content) - ] - - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Slack - {channel_name}", - document_type=DocumentType.SLACK_CONNECTOR, - document_metadata={ - "channel_name": channel_name, - "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - ) - - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" - ) - - except SlackApiError as slack_error: - logger.error( - f"Slack API error for channel {channel_name}: {slack_error!s}" - ) - skipped_channels.append(f"{channel_name} (Slack API error)") - documents_skipped += 1 - continue # Skip this channel and continue with others - except Exception as e: - logger.error(f"Error processing channel {channel_name}: {e!s}") - skipped_channels.append(f"{channel_name} (processing error)") - documents_skipped += 1 - continue # Skip this channel and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one channel - total_processed = documents_indexed - if update_last_indexed and total_processed > 0: - connector.last_indexed_at = datetime.now() - - # Commit all changes - await session.commit() - - # Prepare result message - result_message = None - if skipped_channels: - result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {total_processed} channels." 
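Each Slack channel above is deduplicated by a content hash computed from the combined document string and the search space id before any summarization or embedding work happens. The stand-alone sketch below illustrates that dedup step under stated assumptions: the project's actual generate_content_hash helper is not shown in this diff, so a SHA-256 over the text plus the search-space id is used as a stand-in, and an in-memory set replaces the real lookup against Document.content_hash.

import hashlib


# Stand-in for the project's generate_content_hash(); the real implementation
# is not visible in this diff, so a SHA-256 over the text plus the search-space
# id is assumed here purely for illustration.
def content_hash_sketch(document_string: str, search_space_id: int) -> str:
    payload = f"{search_space_id}:{document_string}".encode("utf-8")
    return hashlib.sha256(payload).hexdigest()


# In-memory stand-in for the select(Document).where(Document.content_hash == ...)
# query performed above against the database.
already_indexed: set[str] = set()


def should_index(document_string: str, search_space_id: int) -> bool:
    """Return False when an identical document already exists in this search space."""
    content_hash = content_hash_sketch(document_string, search_space_id)
    if content_hash in already_indexed:
        return False  # mirrors the "already exists ... Skipping processing" branch
    already_indexed.add(content_hash)
    return True


if __name__ == "__main__":
    channel_doc = "# Slack Channel: general\n\n## alice (2024-01-02)\n\nhello\n\n---\n\n"
    print(should_index(channel_doc, search_space_id=1))  # True  -> new, gets summarized and embedded
    print(should_index(channel_doc, search_space_id=1))  # False -> duplicate, skipped

Because the hash is checked before the LLM summary and embeddings are generated, re-running an indexing job over an unchanged channel skips the expensive work entirely.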
- - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Slack indexing for connector {connector_id}", - { - "channels_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_channels_count": len(skipped_channels), - "result_message": result_message, - }, - ) - - logger.info( - f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" - ) - return total_processed, result_message - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Slack indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}") - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Slack messages for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Slack messages: {e!s}") - return 0, f"Failed to index Slack messages: {e!s}" - - -async def index_notion_pages( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Notion pages from all accessible pages. - - Args: - session: Database session - connector_id: ID of the Notion connector - search_space_id: ID of the search space to store documents in - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="notion_pages_indexing", - source="connector_indexing_task", - message=f"Starting Notion pages indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector - await task_logger.log_task_progress( - log_entry, - f"Retrieving Notion connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.NOTION_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a Notion connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not a Notion connector", - ) - - # Get the Notion token from the connector config - notion_token = connector.config.get("NOTION_INTEGRATION_TOKEN") - if not notion_token: - await task_logger.log_task_failure( - log_entry, - f"Notion integration token not found in connector config for connector {connector_id}", - "Missing Notion token", - {"error_type": "MissingToken"}, - ) - return 0, "Notion integration token not found in connector config" - - # Initialize Notion client - await task_logger.log_task_progress( - log_entry, - f"Initializing Notion client for connector {connector_id}", - 
{"stage": "client_initialization"}, - ) - - logger.info(f"Initializing Notion client for connector {connector_id}") - notion_client = NotionHistoryConnector(token=notion_token) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates - calculated_end_date = datetime.now() - calculated_start_date = calculated_end_date - timedelta( - days=365 - ) # Check for last 1 year of pages - - # Use calculated dates if not provided - if start_date is None: - start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ") - else: - # Convert YYYY-MM-DD to ISO format - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - - if end_date is None: - end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ") - else: - # Convert YYYY-MM-DD to ISO format - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - else: - # Convert provided dates to ISO format for Notion API - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - - logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}") - - await task_logger.log_task_progress( - log_entry, - f"Fetching Notion pages from {start_date_iso} to {end_date_iso}", - { - "stage": "fetch_pages", - "start_date": start_date_iso, - "end_date": end_date_iso, - }, - ) - - # Get all pages - try: - pages = notion_client.get_all_pages( - start_date=start_date_iso, end_date=end_date_iso - ) - logger.info(f"Found {len(pages)} Notion pages") - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Notion pages for connector {connector_id}", - str(e), - {"error_type": "PageFetchError"}, - ) - logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True) - return 0, f"Failed to get Notion pages: {e!s}" - - if not pages: - await task_logger.log_task_success( - log_entry, - f"No Notion pages found for connector {connector_id}", - {"pages_found": 0}, - ) - logger.info("No Notion pages found to index") - return 0, "No Notion pages found" - - # Track the number of documents indexed - documents_indexed = 0 - documents_skipped = 0 - skipped_pages = [] - - await task_logger.log_task_progress( - log_entry, - f"Starting to process {len(pages)} Notion pages", - {"stage": "process_pages", "total_pages": len(pages)}, - ) - - # Process each page - for page in pages: - try: - page_id = page.get("page_id") - page_title = page.get("title", f"Untitled page ({page_id})") - page_content = page.get("content", []) - - logger.info(f"Processing Notion page: {page_title} ({page_id})") - - if not page_content: - logger.info(f"No content found in page {page_title}. 
Skipping.") - skipped_pages.append(f"{page_title} (no content)") - documents_skipped += 1 - continue - - # Convert page content to markdown format - markdown_content = f"# Notion Page: {page_title}\n\n" - - # Process blocks recursively - def process_blocks(blocks, level=0): - result = "" - for block in blocks: - block_type = block.get("type") - block_content = block.get("content", "") - children = block.get("children", []) - - # Add indentation based on level - indent = " " * level - - # Format based on block type - if block_type in ["paragraph", "text"]: - result += f"{indent}{block_content}\n\n" - elif block_type in ["heading_1", "header"]: - result += f"{indent}# {block_content}\n\n" - elif block_type == "heading_2": - result += f"{indent}## {block_content}\n\n" - elif block_type == "heading_3": - result += f"{indent}### {block_content}\n\n" - elif block_type == "bulleted_list_item": - result += f"{indent}* {block_content}\n" - elif block_type == "numbered_list_item": - result += f"{indent}1. {block_content}\n" - elif block_type == "to_do": - result += f"{indent}- [ ] {block_content}\n" - elif block_type == "toggle": - result += f"{indent}> {block_content}\n" - elif block_type == "code": - result += f"{indent}```\n{block_content}\n```\n\n" - elif block_type == "quote": - result += f"{indent}> {block_content}\n\n" - elif block_type == "callout": - result += f"{indent}> **Note:** {block_content}\n\n" - elif block_type == "image": - result += f"{indent}![Image]({block_content})\n\n" - else: - # Default for other block types - if block_content: - result += f"{indent}{block_content}\n\n" - - # Process children recursively - if children: - result += process_blocks(children, level + 1) - - return result - - logger.debug( - f"Converting {len(page_content)} blocks to markdown for page {page_title}" - ) - markdown_content += process_blocks(page_content) - - # Format document metadata - metadata_sections = [ - ("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]), - ( - "CONTENT", - [ - "FORMAT: markdown", - "TEXT_START", - markdown_content, - "TEXT_END", - ], - ), - ] - - # Build the document string - document_parts = [] - document_parts.append("") - - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash( - combined_document_string, search_space_id - ) - - # Check if document with this content hash already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." 
- ) - documents_skipped += 1 - continue - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_pages.append(f"{page_title} (no LLM configured)") - documents_skipped += 1 - continue - - # Generate summary - logger.debug(f"Generating summary for page {page_title}") - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - logger.debug(f"Chunking content for page {page_title}") - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(markdown_content) - ] - - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Notion - {page_title}", - document_type=DocumentType.NOTION_CONNECTOR, - document_metadata={ - "page_title": page_title, - "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new Notion page: {page_title}") - - except Exception as e: - logger.error( - f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_pages.append( - f"{page.get('title', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this page and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one page - total_processed = documents_indexed - if update_last_indexed and total_processed > 0: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at for connector {connector_id}") - - # Commit all changes - await session.commit() - - # Prepare result message - result_message = None - if skipped_pages: - result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" - else: - result_message = f"Processed {total_processed} pages." 
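The Notion indexer above flattens a page's nested block tree into markdown with a recursive process_blocks helper. Below is a trimmed, runnable sketch of that recursion; it covers only a few of the block types handled by the full code, and the sample input is hypothetical, but the block shape ({"type", "content", "children"}) matches the keys the deleted helper reads.

# Trimmed version of the recursive block-to-markdown conversion above; only a
# few of the block types handled by the deleted process_blocks() are shown.
def blocks_to_markdown(blocks: list[dict], level: int = 0) -> str:
    result = ""
    indent = "  " * level
    for block in blocks:
        block_type = block.get("type")
        content = block.get("content", "")
        children = block.get("children", [])

        if block_type in ("heading_1", "header"):
            result += f"{indent}# {content}\n\n"
        elif block_type == "bulleted_list_item":
            result += f"{indent}* {content}\n"
        elif content:  # paragraphs, text, and anything else with content
            result += f"{indent}{content}\n\n"

        if children:  # recurse so nested blocks are indented one level deeper
            result += blocks_to_markdown(children, level + 1)
    return result


# Hypothetical page structure, shaped like the dicts the indexer iterates over.
sample_blocks = [
    {"type": "heading_1", "content": "Roadmap", "children": []},
    {
        "type": "bulleted_list_item",
        "content": "Q3 goals",
        "children": [
            {"type": "bulleted_list_item", "content": "Ship connector refactor", "children": []}
        ],
    },
]

print(blocks_to_markdown(sample_blocks))
# # Roadmap
#
# * Q3 goals
#   * Ship connector refactor

The resulting markdown is what gets chunked and embedded, while the LLM summary is generated from the full tagged document string built afterwards.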
- - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Notion indexing for connector {connector_id}", - { - "pages_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_pages_count": len(skipped_pages), - "result_message": result_message, - }, - ) - - logger.info( - f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" - ) - return total_processed, result_message - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Notion indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error( - f"Database error during Notion indexing: {db_error!s}", exc_info=True - ) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Notion pages for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Notion pages: {e!s}", exc_info=True) - return 0, f"Failed to index Notion pages: {e!s}" - - -async def index_github_repos( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index code and documentation files from accessible GitHub repositories. - - Args: - session: Database session - connector_id: ID of the GitHub connector - search_space_id: ID of the search space to store documents in - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="github_repos_indexing", - source="connector_indexing_task", - message=f"Starting GitHub repositories indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - documents_processed = 0 - errors = [] - - try: - # 1. Get the GitHub connector from the database - await task_logger.log_task_progress( - log_entry, - f"Retrieving GitHub connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.GITHUB_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a GitHub connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not a GitHub connector", - ) - - # 2. 
Get the GitHub PAT and selected repositories from the connector config - github_pat = connector.config.get("GITHUB_PAT") - repo_full_names_to_index = connector.config.get("repo_full_names") - - if not github_pat: - await task_logger.log_task_failure( - log_entry, - f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", - "Missing GitHub PAT", - {"error_type": "MissingToken"}, - ) - return 0, "GitHub Personal Access Token (PAT) not found in connector config" - - if not repo_full_names_to_index or not isinstance( - repo_full_names_to_index, list - ): - await task_logger.log_task_failure( - log_entry, - f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}", - "Invalid repo configuration", - {"error_type": "InvalidConfiguration"}, - ) - return 0, "'repo_full_names' not found or is not a list in connector config" - - # 3. Initialize GitHub connector client - await task_logger.log_task_progress( - log_entry, - f"Initializing GitHub client for connector {connector_id}", - { - "stage": "client_initialization", - "repo_count": len(repo_full_names_to_index), - }, - ) - - try: - github_client = GitHubConnector(token=github_pat) - except ValueError as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to initialize GitHub client for connector {connector_id}", - str(e), - {"error_type": "ClientInitializationError"}, - ) - return 0, f"Failed to initialize GitHub client: {e!s}" - - # 4. Validate selected repositories - # For simplicity, we'll proceed with the list provided. - # If a repo is inaccessible, get_repository_files will likely fail gracefully later. - await task_logger.log_task_progress( - log_entry, - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", - { - "stage": "repo_processing", - "repo_count": len(repo_full_names_to_index), - "start_date": start_date, - "end_date": end_date, - }, - ) - - logger.info( - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." - ) - if start_date and end_date: - logger.info( - f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" - ) - - # 6. Iterate through selected repositories and index files - for repo_full_name in repo_full_names_to_index: - if not repo_full_name or not isinstance(repo_full_name, str): - logger.warning(f"Skipping invalid repository entry: {repo_full_name}") - continue - - logger.info(f"Processing repository: {repo_full_name}") - try: - files_to_index = github_client.get_repository_files(repo_full_name) - if not files_to_index: - logger.info( - f"No indexable files found in repository: {repo_full_name}" - ) - continue - - logger.info( - f"Found {len(files_to_index)} files to process in {repo_full_name}" - ) - - for file_info in files_to_index: - file_path = file_info.get("path") - file_url = file_info.get("url") - file_sha = file_info.get("sha") - file_type = file_info.get("type") # 'code' or 'doc' - full_path_key = f"{repo_full_name}/{file_path}" - - if not file_path or not file_url or not file_sha: - logger.warning( - f"Skipping file with missing info in {repo_full_name}: {file_info}" - ) - continue - - # Get file content - file_content = github_client.get_file_content( - repo_full_name, file_path - ) - - if file_content is None: - logger.warning( - f"Could not retrieve content for {full_path_key}. Skipping." 
- ) - continue # Skip if content fetch failed - - content_hash = generate_content_hash(file_content, search_space_id) - - # Check if document with this content hash already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing." - ) - continue - - # Use file_content directly for chunking, maybe summary for main content? - # For now, let's use the full content for both, might need refinement - summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." # Simple summary - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Chunk the content - try: - chunks_data = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed( - chunk.text - ), - ) - for chunk in config.code_chunker_instance.chunk( - file_content - ) - ] - except Exception as chunk_err: - logger.error( - f"Failed to chunk file {full_path_key}: {chunk_err}" - ) - errors.append( - f"Chunking failed for {full_path_key}: {chunk_err}" - ) - continue # Skip this file if chunking fails - - doc_metadata = { - "repository_full_name": repo_full_name, - "file_path": file_path, - "full_path": full_path_key, # For easier lookup - "url": file_url, - "sha": file_sha, - "type": file_type, - "indexed_at": datetime.now(UTC).isoformat(), - } - - # Create new document - logger.info(f"Creating new document for file: {full_path_key}") - document = Document( - title=f"GitHub - {file_path}", - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, # Store summary - content_hash=content_hash, - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data, # Associate chunks directly - ) - session.add(document) - documents_processed += 1 - - except Exception as repo_err: - logger.error( - f"Failed to process repository {repo_full_name}: {repo_err}" - ) - errors.append(f"Failed processing {repo_full_name}: {repo_err}") - - # Commit all changes at the end - await session.commit() - logger.info( - f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." 
- ) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed GitHub indexing for connector {connector_id}", - { - "documents_processed": documents_processed, - "errors_count": len(errors), - "repo_count": len(repo_full_names_to_index), - }, - ) - - except SQLAlchemyError as db_err: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during GitHub indexing for connector {connector_id}", - str(db_err), - {"error_type": "SQLAlchemyError"}, - ) - logger.error( - f"Database error during GitHub indexing for connector {connector_id}: {db_err}" - ) - errors.append(f"Database error: {db_err}") - return documents_processed, "; ".join(errors) if errors else str(db_err) - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Unexpected error during GitHub indexing for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error( - f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", - exc_info=True, - ) - errors.append(f"Unexpected error: {e}") - return documents_processed, "; ".join(errors) if errors else str(e) - - error_message = "; ".join(errors) if errors else None - return documents_processed, error_message - - -async def index_linear_issues( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Linear issues and comments. - - Args: - session: Database session - connector_id: ID of the Linear connector - search_space_id: ID of the search space to store documents in - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="linear_issues_indexing", - source="connector_indexing_task", - message=f"Starting Linear issues indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector - await task_logger.log_task_progress( - log_entry, - f"Retrieving Linear connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.LINEAR_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a Linear connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not a Linear connector", - ) - - # Get the Linear token from the connector config - linear_token = connector.config.get("LINEAR_API_KEY") - if not linear_token: - await task_logger.log_task_failure( - log_entry, - f"Linear API token not found in connector config for connector {connector_id}", - "Missing Linear token", - {"error_type": "MissingToken"}, - ) - return 0, "Linear API token not found in connector config" - - # Initialize Linear client - await 
task_logger.log_task_progress( - log_entry, - f"Initializing Linear client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - linear_client = LinearConnector(token=linear_token) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now() - - # Use last_indexed_at as start date if available, otherwise use 365 days ago - if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = ( - connector.last_indexed_at.replace(tzinfo=None) - if connector.last_indexed_at.tzinfo - else connector.last_indexed_at - ) - - # Check if last_indexed_at is in the future or after end_date - if last_indexed_naive > calculated_end_date: - logger.warning( - f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." - ) - calculated_start_date = calculated_end_date - timedelta(days=365) - else: - calculated_start_date = last_indexed_naive - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta( - days=365 - ) # Use 365 days as default - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" - ) - - # Use calculated dates if not provided - start_date_str = ( - start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - ) - end_date_str = ( - end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") - ) - else: - # Use provided dates - start_date_str = start_date - end_date_str = end_date - - logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}") - - await task_logger.log_task_progress( - log_entry, - f"Fetching Linear issues from {start_date_str} to {end_date_str}", - { - "stage": "fetch_issues", - "start_date": start_date_str, - "end_date": end_date_str, - }, - ) - - # Get issues within date range - try: - issues, error = linear_client.get_issues_by_date_range( - start_date=start_date_str, end_date=end_date_str, include_comments=True - ) - - if error: - logger.error(f"Failed to get Linear issues: {error}") - - # Don't treat "No issues found" as an error that should stop indexing - if "No issues found" in error: - logger.info( - "No issues found is not a critical error, continuing with update" - ) - if update_last_indexed: - connector.last_indexed_at = datetime.now() - await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" - ) - return 0, None - else: - return 0, f"Failed to get Linear issues: {error}" - - logger.info(f"Retrieved {len(issues)} issues from Linear API") - - except Exception as e: - logger.error(f"Exception when calling Linear API: {e!s}", exc_info=True) - return 0, f"Failed to get Linear issues: {e!s}" - - if not issues: - logger.info("No Linear issues found for the specified date range") - if update_last_indexed: - connector.last_indexed_at = datetime.now() - await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" - ) - return 0, None # Return None instead of error message when no issues found - - # Log issue IDs and titles for debugging - logger.info("Issues retrieved from Linear API:") - for idx, issue in enumerate(issues[:10]): # Log first 10 issues - logger.info( - f" {idx + 1}. 
{issue.get('identifier', 'Unknown')} - {issue.get('title', 'Unknown')} - Created: {issue.get('createdAt', 'Unknown')} - Updated: {issue.get('updatedAt', 'Unknown')}" - ) - if len(issues) > 10: - logger.info(f" ...and {len(issues) - 10} more issues") - - # Track the number of documents indexed - documents_indexed = 0 - documents_skipped = 0 - skipped_issues = [] - - await task_logger.log_task_progress( - log_entry, - f"Starting to process {len(issues)} Linear issues", - {"stage": "process_issues", "total_issues": len(issues)}, - ) - - # Process each issue - for issue in issues: - try: - issue_id = issue.get("key") - issue_identifier = issue.get("id", "") - issue_title = issue.get("key", "") - - if not issue_id or not issue_title: - logger.warning( - f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" - ) - skipped_issues.append( - f"{issue_identifier or 'Unknown'} (missing data)" - ) - documents_skipped += 1 - continue - - # Format the issue first to get well-structured data - formatted_issue = linear_client.format_issue(issue) - - # Convert issue to markdown format - issue_content = linear_client.format_issue_to_markdown(formatted_issue) - - if not issue_content: - logger.warning( - f"Skipping issue with no content: {issue_identifier} - {issue_title}" - ) - skipped_issues.append(f"{issue_identifier} (no content)") - documents_skipped += 1 - continue - - # Create a short summary for the embedding - # This avoids using the LLM and just uses the issue data directly - state = formatted_issue.get("state", "Unknown") - description = formatted_issue.get("description", "") - # Truncate description if it's too long for the summary - if description and len(description) > 500: - description = description[:497] + "..." - - # Create a simple summary from the issue data - summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - - # Add comment count - comment_count = len(formatted_issue.get("comments", [])) - summary_content += f"Comments: {comment_count}" - - content_hash = generate_content_hash(issue_content, search_space_id) - - # Check if document with this content hash already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." 
- ) - documents_skipped += 1 - continue - - # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(issue_content) - ] - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) - document = Document( - search_space_id=search_space_id, - title=f"Linear - {issue_identifier}: {issue_title}", - document_type=DocumentType.LINEAR_CONNECTOR, - document_metadata={ - "issue_id": issue_id, - "issue_identifier": issue_identifier, - "issue_title": issue_title, - "state": state, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" - ) - - except Exception as e: - logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this issue and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - # Commit all changes - await session.commit() - logger.info("Successfully committed all Linear document changes to database") - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Linear indexing for connector {connector_id}", - { - "issues_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues), - }, - ) - - logger.info( - f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" - ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Linear indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Linear issues for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Linear issues: {e!s}", exc_info=True) - return 0, f"Failed to index Linear issues: {e!s}" - - -async def index_discord_messages( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Discord messages from all accessible channels. 
- - Args: - session: Database session - connector_id: ID of the Discord connector - search_space_id: ID of the search space to store documents in - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="discord_messages_indexing", - source="connector_indexing_task", - message=f"Starting Discord messages indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector - await task_logger.log_task_progress( - log_entry, - f"Retrieving Discord connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.DISCORD_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a Discord connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not a Discord connector", - ) - - # Get the Discord token from the connector config - discord_token = connector.config.get("DISCORD_BOT_TOKEN") - if not discord_token: - await task_logger.log_task_failure( - log_entry, - f"Discord token not found in connector config for connector {connector_id}", - "Missing Discord token", - {"error_type": "MissingToken"}, - ) - return 0, "Discord token not found in connector config" - - logger.info(f"Starting Discord indexing for connector {connector_id}") - - # Initialize Discord client - await task_logger.log_task_progress( - log_entry, - f"Initializing Discord client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - discord_client = DiscordConnector(token=discord_token) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now(UTC) - - # Use last_indexed_at as start date if available, otherwise use 365 days ago - if connector.last_indexed_at: - calculated_start_date = connector.last_indexed_at.replace(tzinfo=UTC) - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta(days=365) - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" - ) - - # Use calculated dates if not provided, convert to ISO format for Discord API - if start_date is None: - start_date_iso = calculated_start_date.isoformat() - else: - # Convert YYYY-MM-DD to ISO format - start_date_iso = ( - datetime.strptime(start_date, "%Y-%m-%d") - .replace(tzinfo=UTC) - .isoformat() - ) - - if end_date is None: - end_date_iso = calculated_end_date.isoformat() - else: - # Convert YYYY-MM-DD to ISO format - end_date_iso = ( - datetime.strptime(end_date, "%Y-%m-%d") - .replace(tzinfo=UTC) - .isoformat() - ) - else: - # Convert provided dates to ISO format for Discord API - start_date_iso = ( - datetime.strptime(start_date, "%Y-%m-%d") 
- .replace(tzinfo=UTC) - .isoformat() - ) - end_date_iso = ( - datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC).isoformat() - ) - - logger.info( - f"Indexing Discord messages from {start_date_iso} to {end_date_iso}" - ) - - documents_indexed = 0 - documents_skipped = 0 - skipped_channels = [] - - try: - await task_logger.log_task_progress( - log_entry, - f"Starting Discord bot and fetching guilds for connector {connector_id}", - {"stage": "fetch_guilds"}, - ) - - logger.info("Starting Discord bot to fetch guilds") - discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) - await discord_client._wait_until_ready() - - logger.info("Fetching Discord guilds") - guilds = await discord_client.get_guilds() - logger.info(f"Found {len(guilds)} guilds") - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Discord guilds for connector {connector_id}", - str(e), - {"error_type": "GuildFetchError"}, - ) - logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True) - await discord_client.close_bot() - return 0, f"Failed to get Discord guilds: {e!s}" - if not guilds: - await task_logger.log_task_success( - log_entry, - f"No Discord guilds found for connector {connector_id}", - {"guilds_found": 0}, - ) - logger.info("No Discord guilds found to index") - await discord_client.close_bot() - return 0, "No Discord guilds found" - - # Process each guild and channel - await task_logger.log_task_progress( - log_entry, - f"Starting to process {len(guilds)} Discord guilds", - {"stage": "process_guilds", "total_guilds": len(guilds)}, - ) - - for guild in guilds: - guild_id = guild["id"] - guild_name = guild["name"] - logger.info(f"Processing guild: {guild_name} ({guild_id})") - try: - channels = await discord_client.get_text_channels(guild_id) - if not channels: - logger.info(f"No channels found in guild {guild_name}. Skipping.") - skipped_channels.append(f"{guild_name} (no channels)") - documents_skipped += 1 - continue - - for channel in channels: - channel_id = channel["id"] - channel_name = channel["name"] - - try: - messages = await discord_client.get_channel_history( - channel_id=channel_id, - start_date=start_date_iso, - end_date=end_date_iso, - ) - except Exception as e: - logger.error( - f"Failed to get messages for channel {channel_name}: {e!s}" - ) - skipped_channels.append( - f"{guild_name}#{channel_name} (fetch error)" - ) - documents_skipped += 1 - continue - - if not messages: - logger.info( - f"No messages found in channel {channel_name} for the specified date range." - ) - documents_skipped += 1 - continue - - # Format messages - formatted_messages = [] - for msg in messages: - # Skip system messages if needed (Discord has some types) - if msg.get("type") in ["system"]: - continue - formatted_messages.append(msg) - - if not formatted_messages: - logger.info( - f"No valid messages found in channel {channel_name} after filtering." 
- ) - documents_skipped += 1 - continue - - # Convert messages to markdown format - channel_content = ( - f"# Discord Channel: {guild_name} / {channel_name}\n\n" - ) - for msg in formatted_messages: - user_name = msg.get("author_name", "Unknown User") - timestamp = msg.get("created_at", "Unknown Time") - text = msg.get("content", "") - channel_content += ( - f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" - ) - - # Format document metadata - metadata_sections = [ - ( - "METADATA", - [ - f"GUILD_NAME: {guild_name}", - f"GUILD_ID: {guild_id}", - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - f"MESSAGE_COUNT: {len(formatted_messages)}", - ], - ), - ( - "CONTENT", - [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END", - ], - ), - ] - - # Build the document string - document_parts = [] - document_parts.append("") - for section_title, section_content in metadata_sections: - document_parts.append(f"<{section_title}>") - document_parts.extend(section_content) - document_parts.append(f"") - document_parts.append("") - combined_document_string = "\n".join(document_parts) - content_hash = generate_content_hash( - combined_document_string, search_space_id - ) - - # Check if document with this content hash already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing." - ) - documents_skipped += 1 - continue - - # Get user's long context LLM - user_llm = await get_user_long_context_llm(session, user_id) - if not user_llm: - logger.error( - f"No long context LLM configured for user {user_id}" - ) - skipped_channels.append( - f"{guild_name}#{channel_name} (no LLM configured)" - ) - documents_skipped += 1 - continue - - # Generate summary using summary_chain - summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke( - {"document": combined_document_string} - ) - summary_content = summary_result.content - summary_embedding = await asyncio.to_thread( - config.embedding_model_instance.embed, summary_content - ) - - # Process chunks - raw_chunks = await asyncio.to_thread( - config.chunker_instance.chunk, channel_content - ) - - chunk_texts = [ - chunk.text for chunk in raw_chunks if chunk.text.strip() - ] - chunk_embeddings = await asyncio.to_thread( - lambda texts: [ - config.embedding_model_instance.embed(t) for t in texts - ], - chunk_texts, - ) - - chunks = [ - Chunk(content=raw_chunk.text, embedding=embedding) - for raw_chunk, embedding in zip( - raw_chunks, chunk_embeddings, strict=False - ) - ] - - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Discord - {guild_name}#{channel_name}", - document_type=DocumentType.DISCORD_CONNECTOR, - document_metadata={ - "guild_name": guild_name, - "guild_id": guild_id, - "channel_name": channel_name, - "channel_id": channel_id, - "message_count": len(formatted_messages), - "start_date": start_date_iso, - "end_date": end_date_iso, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new 
channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" - ) - - except Exception as e: - logger.error( - f"Error processing guild {guild_name}: {e!s}", exc_info=True - ) - skipped_channels.append(f"{guild_name} (processing error)") - documents_skipped += 1 - continue - - if update_last_indexed and documents_indexed > 0: - connector.last_indexed_at = datetime.now(UTC) - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - await session.commit() - await discord_client.close_bot() - - # Prepare result message - result_message = None - if skipped_channels: - result_message = f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {documents_indexed} channels." - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Discord indexing for connector {connector_id}", - { - "channels_processed": documents_indexed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_channels_count": len(skipped_channels), - "guilds_processed": len(guilds), - "result_message": result_message, - }, - ) - - logger.info( - f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" - ) - return documents_indexed, result_message - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Discord indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error( - f"Database error during Discord indexing: {db_error!s}", exc_info=True - ) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Discord messages for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Discord messages: {e!s}", exc_info=True) - return 0, f"Failed to index Discord messages: {e!s}" - - -async def index_jira_issues( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Jira issues and comments. 
- - Args: - session: Database session - connector_id: ID of the Jira connector - search_space_id: ID of the search space to store documents in - user_id: User ID - start_date: Start date for indexing (YYYY-MM-DD format) - end_date: End date for indexing (YYYY-MM-DD format) - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="jira_issues_indexing", - source="connector_indexing_task", - message=f"Starting Jira issues indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector from the database - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.JIRA_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return 0, f"Connector with ID {connector_id} not found" - - # Get the Jira credentials from the connector config - jira_email = connector.config.get("JIRA_EMAIL") - jira_api_token = connector.config.get("JIRA_API_TOKEN") - jira_base_url = connector.config.get("JIRA_BASE_URL") - - if not jira_email or not jira_api_token or not jira_base_url: - await task_logger.log_task_failure( - log_entry, - f"Jira credentials not found in connector config for connector {connector_id}", - "Missing Jira credentials", - {"error_type": "MissingCredentials"}, - ) - return 0, "Jira credentials not found in connector config" - - # Initialize Jira client - await task_logger.log_task_progress( - log_entry, - f"Initializing Jira client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - jira_client = JiraConnector( - base_url=jira_base_url, email=jira_email, api_token=jira_api_token - ) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now() - - # Use last_indexed_at as start date if available, otherwise use 365 days ago - if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = ( - connector.last_indexed_at.replace(tzinfo=None) - if connector.last_indexed_at.tzinfo - else connector.last_indexed_at - ) - - # Check if last_indexed_at is in the future or after end_date - if last_indexed_naive > calculated_end_date: - logger.warning( - f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
- ) - calculated_start_date = calculated_end_date - timedelta(days=365) - else: - calculated_start_date = last_indexed_naive - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta( - days=365 - ) # Use 365 days as default - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" - ) - - # Use calculated dates if not provided - start_date_str = ( - start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - ) - end_date_str = ( - end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") - ) - else: - # Use provided dates - start_date_str = start_date - end_date_str = end_date - - await task_logger.log_task_progress( - log_entry, - f"Fetching Jira issues from {start_date_str} to {end_date_str}", - { - "stage": "fetching_issues", - "start_date": start_date_str, - "end_date": end_date_str, - }, - ) - - # Get issues within date range - try: - issues, error = jira_client.get_issues_by_date_range( - start_date=start_date_str, end_date=end_date_str, include_comments=True - ) - - if error: - logger.error(f"Failed to get Jira issues: {error}") - - # Don't treat "No issues found" as an error that should stop indexing - if "No issues found" in error: - logger.info( - "No issues found is not a critical error, continuing with update" - ) - if update_last_indexed: - connector.last_indexed_at = datetime.now() - await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" - ) - - await task_logger.log_task_success( - log_entry, - f"No Jira issues found in date range {start_date_str} to {end_date_str}", - {"issues_found": 0}, - ) - return 0, None - else: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Jira issues: {error}", - "API Error", - {"error_type": "APIError"}, - ) - return 0, f"Failed to get Jira issues: {error}" - - logger.info(f"Retrieved {len(issues)} issues from Jira API") - - except Exception as e: - logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True) - return 0, f"Error fetching Jira issues: {e!s}" - - # Process and index each issue - documents_indexed = 0 - skipped_issues = [] - documents_skipped = 0 - - for issue in issues: - try: - issue_id = issue.get("key") - issue_identifier = issue.get("key", "") - issue_title = issue.get("id", "") - - if not issue_id or not issue_title: - logger.warning( - f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" - ) - skipped_issues.append( - f"{issue_identifier or 'Unknown'} (missing data)" - ) - documents_skipped += 1 - continue - - # Format the issue for better readability - formatted_issue = jira_client.format_issue(issue) - - # Convert to markdown - issue_content = jira_client.format_issue_to_markdown(formatted_issue) - - if not issue_content: - logger.warning( - f"Skipping issue with no content: {issue_identifier} - {issue_title}" - ) - skipped_issues.append(f"{issue_identifier} (no content)") - documents_skipped += 1 - continue - - # Create a simple summary - summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" - if formatted_issue.get("description"): - summary_content += ( - f"Description: {formatted_issue.get('description')}\n\n" - ) - - # Add comment count - comment_count = len(formatted_issue.get("comments", [])) - summary_content += f"Comments: 
{comment_count}" - - # Generate content hash - content_hash = generate_content_hash(issue_content, search_space_id) - - # Check if document already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." - ) - documents_skipped += 1 - continue - - # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(issue_content) - ] - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) - document = Document( - search_space_id=search_space_id, - title=f"Jira - {issue_identifier}: {issue_title}", - document_type=DocumentType.JIRA_CONNECTOR, - document_metadata={ - "issue_id": issue_id, - "issue_identifier": issue_identifier, - "issue_title": issue_title, - "state": formatted_issue.get("status", "Unknown"), - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" - ) - - except Exception as e: - logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this issue and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - # Commit all changes - await session.commit() - logger.info("Successfully committed all JIRA document changes to database") - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed JIRA indexing for connector {connector_id}", - { - "issues_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues), - }, - ) - - logger.info( - f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" - ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during JIRA indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index JIRA issues for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to 
index JIRA issues: {e!s}", exc_info=True) - return 0, f"Failed to index JIRA issues: {e!s}" - - -async def index_confluence_pages( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Confluence pages and comments. - - Args: - session: Database session - connector_id: ID of the Confluence connector - search_space_id: ID of the search space to store documents in - user_id: User ID - start_date: Start date for indexing (YYYY-MM-DD format) - end_date: End date for indexing (YYYY-MM-DD format) - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="confluence_pages_indexing", - source="connector_indexing_task", - message=f"Starting Confluence pages indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector from the database - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.CONFLUENCE_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return 0, f"Connector with ID {connector_id} not found" - - # Get the Confluence credentials from the connector config - confluence_email = connector.config.get("CONFLUENCE_EMAIL") - confluence_api_token = connector.config.get("CONFLUENCE_API_TOKEN") - confluence_base_url = connector.config.get("CONFLUENCE_BASE_URL") - - if not confluence_email or not confluence_api_token or not confluence_base_url: - await task_logger.log_task_failure( - log_entry, - f"Confluence credentials not found in connector config for connector {connector_id}", - "Missing Confluence credentials", - {"error_type": "MissingCredentials"}, - ) - return 0, "Confluence credentials not found in connector config" - - # Initialize Confluence client - await task_logger.log_task_progress( - log_entry, - f"Initializing Confluence client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - confluence_client = ConfluenceConnector( - base_url=confluence_base_url, - email=confluence_email, - api_token=confluence_api_token, - ) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now() - - # Use last_indexed_at as start date if available, otherwise use 365 days ago - if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = ( - connector.last_indexed_at.replace(tzinfo=None) - if connector.last_indexed_at.tzinfo - else connector.last_indexed_at - ) - - # Check if last_indexed_at is in the future or after end_date - if last_indexed_naive > calculated_end_date: - logger.warning( - f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
- ) - calculated_start_date = calculated_end_date - timedelta(days=365) - else: - calculated_start_date = last_indexed_naive - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta( - days=365 - ) # Use 365 days as default - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" - ) - - # Use calculated dates if not provided - start_date_str = ( - start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - ) - end_date_str = ( - end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") - ) - else: - # Use provided dates - start_date_str = start_date - end_date_str = end_date - - await task_logger.log_task_progress( - log_entry, - f"Fetching Confluence pages from {start_date_str} to {end_date_str}", - { - "stage": "fetching_pages", - "start_date": start_date_str, - "end_date": end_date_str, - }, - ) - - # Get pages within date range - try: - pages, error = confluence_client.get_pages_by_date_range( - start_date=start_date_str, end_date=end_date_str, include_comments=True - ) - - if error: - logger.error(f"Failed to get Confluence pages: {error}") - - # Don't treat "No pages found" as an error that should stop indexing - if "No pages found" in error: - logger.info( - "No pages found is not a critical error, continuing with update" - ) - if update_last_indexed: - connector.last_indexed_at = datetime.now() - await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found" - ) - - await task_logger.log_task_success( - log_entry, - f"No Confluence pages found in date range {start_date_str} to {end_date_str}", - {"pages_found": 0}, - ) - return 0, None - else: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Confluence pages: {error}", - "API Error", - {"error_type": "APIError"}, - ) - return 0, f"Failed to get Confluence pages: {error}" - - logger.info(f"Retrieved {len(pages)} pages from Confluence API") - - except Exception as e: - logger.error(f"Error fetching Confluence pages: {e!s}", exc_info=True) - return 0, f"Error fetching Confluence pages: {e!s}" - - # Process and index each page - documents_indexed = 0 - skipped_pages = [] - documents_skipped = 0 - - for page in pages: - try: - page_id = page.get("id") - page_title = page.get("title", "") - space_id = page.get("spaceId", "") - - if not page_id or not page_title: - logger.warning( - f"Skipping page with missing ID or title: {page_id or 'Unknown'}" - ) - skipped_pages.append(f"{page_title or 'Unknown'} (missing data)") - documents_skipped += 1 - continue - - # Extract page content - page_content = "" - if page.get("body") and page["body"].get("storage"): - page_content = page["body"]["storage"].get("value", "") - - # Add comments to content - comments = page.get("comments", []) - comments_content = "" - if comments: - comments_content = "\n\n## Comments\n\n" - for comment in comments: - comment_body = "" - if comment.get("body") and comment["body"].get("storage"): - comment_body = comment["body"]["storage"].get("value", "") - - comment_author = comment.get("version", {}).get( - "authorId", "Unknown" - ) - comment_date = comment.get("version", {}).get("createdAt", "") - - comments_content += f"**Comment by {comment_author}** ({comment_date}):\n{comment_body}\n\n" - - # Combine page content with comments - full_content = f"# 
{page_title}\n\n{page_content}{comments_content}" - - if not full_content.strip(): - logger.warning(f"Skipping page with no content: {page_title}") - skipped_pages.append(f"{page_title} (no content)") - documents_skipped += 1 - continue - - # Create a simple summary - summary_content = ( - f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" - ) - if page_content: - # Take first 500 characters of content for summary - content_preview = page_content[:500] - if len(page_content) > 500: - content_preview += "..." - summary_content += f"Content Preview: {content_preview}\n\n" - - # Add comment count - comment_count = len(comments) - summary_content += f"Comments: {comment_count}" - - # Generate content hash - content_hash = generate_content_hash(full_content, search_space_id) - - # Check if document already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." - ) - documents_skipped += 1 - continue - - # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full page content with comments - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(full_content) - ] - - # Create and store new document - logger.info(f"Creating new document for page {page_title}") - document = Document( - search_space_id=search_space_id, - title=f"Confluence - {page_title}", - document_type=DocumentType.CONFLUENCE_CONNECTOR, - document_metadata={ - "page_id": page_id, - "page_title": page_title, - "space_id": space_id, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new page {page_title}") - - except Exception as e: - logger.error( - f"Error processing page {page.get('title', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_pages.append( - f"{page.get('title', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this page and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - # Commit all changes - await session.commit() - logger.info( - "Successfully committed all Confluence document changes to database" - ) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Confluence indexing for connector {connector_id}", - { - "pages_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_pages_count": len(skipped_pages), - }, - ) - - logger.info( - f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" - ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success - - except SQLAlchemyError as db_error: - await 
session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Confluence indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Confluence pages for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Confluence pages: {e!s}", exc_info=True) - return 0, f"Failed to index Confluence pages: {e!s}" - - -async def index_clickup_tasks( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index tasks from ClickUp workspace. - - Args: - session: Database session - connector_id: ID of the ClickUp connector - search_space_id: ID of the search space - user_id: ID of the user - start_date: Start date for filtering tasks (YYYY-MM-DD format) - end_date: End date for filtering tasks (YYYY-MM-DD format) - update_last_indexed: Whether to update the last_indexed_at timestamp - - Returns: - Tuple of (number of indexed tasks, error message if any) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="clickup_tasks_indexing", - source="connector_indexing_task", - message=f"Starting ClickUp tasks indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get connector configuration - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id - ) - ) - connector = result.scalars().first() - - if not connector: - error_msg = f"ClickUp connector with ID {connector_id} not found" - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not a ClickUp connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return 0, error_msg - - # Extract ClickUp configuration - clickup_api_token = connector.config.get("CLICKUP_API_TOKEN") - - if not clickup_api_token: - error_msg = "ClickUp API token not found in connector configuration" - await task_logger.log_task_failure( - log_entry, - f"ClickUp API token not found in connector config for connector {connector_id}", - "Missing ClickUp token", - {"error_type": "MissingToken"}, - ) - return 0, error_msg - - await task_logger.log_task_progress( - log_entry, - f"Initializing ClickUp client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - clickup_client = ClickUpConnector(api_token=clickup_api_token) - - # Get authorized workspaces - await task_logger.log_task_progress( - log_entry, - "Fetching authorized ClickUp workspaces", - {"stage": "workspace_fetching"}, - ) - - workspaces_response = clickup_client.get_authorized_workspaces() - workspaces = workspaces_response.get("teams", []) - - if not workspaces: - error_msg = "No authorized ClickUp workspaces found" - await task_logger.log_task_failure( - log_entry, - f"No authorized ClickUp workspaces found for connector {connector_id}", - "No workspaces found", - {"error_type": "NoWorkspacesFound"}, - ) - return 0, error_msg - - # Process and index each task - 
documents_indexed = 0 - documents_skipped = 0 - - for workspace in workspaces: - workspace_id = workspace.get("id") - workspace_name = workspace.get("name", "Unknown Workspace") - - if not workspace_id: - continue - - await task_logger.log_task_progress( - log_entry, - f"Processing workspace: {workspace_name}", - {"stage": "workspace_processing", "workspace_id": workspace_id}, - ) - - # Fetch tasks from workspace - if start_date and end_date: - tasks, error = clickup_client.get_tasks_in_date_range( - workspace_id=workspace_id, - start_date=start_date, - end_date=end_date, - include_closed=True, - ) - if error: - logger.warning( - f"Error fetching tasks from workspace {workspace_name}: {error}" - ) - continue - else: - tasks = clickup_client.get_workspace_tasks( - workspace_id=workspace_id, include_closed=True - ) - - await task_logger.log_task_progress( - log_entry, - f"Found {len(tasks)} tasks in workspace {workspace_name}", - {"stage": "tasks_found", "task_count": len(tasks)}, - ) - - # Process each task - for task in tasks: - try: - task_id = task.get("id") - task_name = task.get("name", "Untitled Task") - task_description = task.get("description", "") - task_status = task.get("status", {}).get("status", "Unknown") - task_priority = ( - task.get("priority", {}).get("priority", "Unknown") - if task.get("priority") - else "None" - ) - task_assignees = task.get("assignees", []) - task_due_date = task.get("due_date") - task_created = task.get("date_created") - task_updated = task.get("date_updated") - - # Get list and space information - task_list = task.get("list", {}) - task_list_name = task_list.get("name", "Unknown List") - task_space = task.get("space", {}) - task_space_name = task_space.get("name", "Unknown Space") - - # Create task content - content_parts = [f"Task: {task_name}"] - - if task_description: - content_parts.append(f"Description: {task_description}") - - content_parts.extend( - [ - f"Status: {task_status}", - f"Priority: {task_priority}", - f"List: {task_list_name}", - f"Space: {task_space_name}", - ] - ) - - if task_assignees: - assignee_names = [ - assignee.get("username", "Unknown") - for assignee in task_assignees - ] - content_parts.append(f"Assignees: {', '.join(assignee_names)}") - - if task_due_date: - content_parts.append(f"Due Date: {task_due_date}") - - task_content = "\n".join(content_parts) - - if not task_content.strip(): - logger.warning(f"Skipping task with no content: {task_name}") - continue - - # Generate content hash - content_hash = generate_content_hash(task_content, search_space_id) - - # Check if document already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing." 
- ) - documents_skipped += 1 - continue - - # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed( - task_content - ) - - # Process chunks - using the full page content with comments - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(task_content) - ] - - # Create and store new document - logger.info(f"Creating new document for task {task_name}") - - document = Document( - search_space_id=search_space_id, - title=f"Task - {task_name}", - document_type=DocumentType.CLICKUP_CONNECTOR, - document_metadata={ - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_assignees": task_assignees, - "task_due_date": task_due_date, - "task_created": task_created, - "task_updated": task_updated, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=task_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new task {task_name}") - - except Exception as e: - logger.error( - f"Error processing task {task.get('name', 'Unknown')}: {e!s}", - exc_info=True, - ) - documents_skipped += 1 - - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - # Commit all changes - await session.commit() - logger.info( - "Successfully committed all clickup document changes to database" - ) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed clickup indexing for connector {connector_id}", - { - "pages_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - }, - ) - - logger.info( - f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped" - ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Cickup indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index ClickUp tasks for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index ClickUp tasks: {e!s}", exc_info=True) - return 0, f"Failed to index ClickUp tasks: {e!s}" - - -async def index_google_calendar_events( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, -) -> tuple[int, str | None]: - """ - Index Google Calendar events. 
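Aside, not part of the patch: the Jira and Confluence indexers above, and the Google Calendar indexer below, all repeat the same last_indexed_at fallback when no explicit date range is supplied. A distilled sketch of that window calculation follows; the helper name is illustrative only (Jira and Confluence default to a 365-day window, the calendar indexer to 30 days).

from datetime import datetime, timedelta


def resolve_index_window(
    last_indexed_at: datetime | None, default_days: int = 365
) -> tuple[str, str]:
    """Illustrative helper mirroring the fallback logic repeated in the indexers."""
    end = datetime.now()
    if last_indexed_at is not None:
        # Compare as timezone-naive values, as the indexers do.
        start = (
            last_indexed_at.replace(tzinfo=None)
            if last_indexed_at.tzinfo
            else last_indexed_at
        )
        # A last_indexed_at in the future falls back to the default window.
        if start > end:
            start = end - timedelta(days=default_days)
    else:
        start = end - timedelta(days=default_days)
    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")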
- - Args: - session: Database session - connector_id: ID of the Google Calendar connector - search_space_id: ID of the search space to store documents in - user_id: User ID - start_date: Start date for indexing (YYYY-MM-DD format) - end_date: End date for indexing (YYYY-MM-DD format) - update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="google_calendar_events_indexing", - source="connector_indexing_task", - message=f"Starting Google Calendar events indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector from the database - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR, - ) - ) - connector = result.scalars().first() - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return 0, f"Connector with ID {connector_id} not found" - - # Get the Google Calendar credentials from the connector config - credentials = Credentials( - token=connector.config.get("token"), - refresh_token=connector.config.get("refresh_token"), - token_uri=connector.config.get("token_uri"), - client_id=connector.config.get("client_id"), - client_secret=connector.config.get("client_secret"), - scopes=connector.config.get("scopes"), - ) - - if ( - not credentials.client_id - or not credentials.client_secret - or not credentials.refresh_token - ): - await task_logger.log_task_failure( - log_entry, - f"Google Calendar credentials not found in connector config for connector {connector_id}", - "Missing Google Calendar credentials", - {"error_type": "MissingCredentials"}, - ) - return 0, "Google Calendar credentials not found in connector config" - - # Initialize Google Calendar client - await task_logger.log_task_progress( - log_entry, - f"Initializing Google Calendar client for connector {connector_id}", - {"stage": "client_initialization"}, - ) - - calendar_client = GoogleCalendarConnector(credentials=credentials) - - # Calculate date range - if start_date is None or end_date is None: - # Fall back to calculating dates based on last_indexed_at - calculated_end_date = datetime.now() - - # Use last_indexed_at as start date if available, otherwise use 30 days ago - if connector.last_indexed_at: - # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = ( - connector.last_indexed_at.replace(tzinfo=None) - if connector.last_indexed_at.tzinfo - else connector.last_indexed_at - ) - - # Check if last_indexed_at is in the future or after end_date - if last_indexed_naive > calculated_end_date: - logger.warning( - f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead." 
- ) - calculated_start_date = calculated_end_date - timedelta(days=30) - else: - calculated_start_date = last_indexed_naive - logger.info( - f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" - ) - else: - calculated_start_date = calculated_end_date - timedelta( - days=30 - ) # Use 30 days as default for calendar events - logger.info( - f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (30 days ago) as start date" - ) - - # Use calculated dates if not provided - start_date_str = ( - start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - ) - end_date_str = ( - end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") - ) - else: - # Use provided dates - start_date_str = start_date - end_date_str = end_date - - await task_logger.log_task_progress( - log_entry, - f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", - { - "stage": "fetching_events", - "start_date": start_date_str, - "end_date": end_date_str, - }, - ) - - # Get events within date range from primary calendar - try: - events, error = calendar_client.get_all_primary_calendar_events( - start_date=start_date_str, end_date=end_date_str - ) - - if error: - logger.error(f"Failed to get Google Calendar events: {error}") - - # Don't treat "No events found" as an error that should stop indexing - if "No events found" in error: - logger.info( - "No events found is not a critical error, continuing with update" - ) - if update_last_indexed: - connector.last_indexed_at = datetime.now() - await session.commit() - logger.info( - f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found" - ) - - await task_logger.log_task_success( - log_entry, - f"No Google Calendar events found in date range {start_date_str} to {end_date_str}", - {"events_found": 0}, - ) - return 0, None - else: - await task_logger.log_task_failure( - log_entry, - f"Failed to get Google Calendar events: {error}", - "API Error", - {"error_type": "APIError"}, - ) - return 0, f"Failed to get Google Calendar events: {error}" - - logger.info(f"Retrieved {len(events)} events from Google Calendar API") - - except Exception as e: - logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True) - return 0, f"Error fetching Google Calendar events: {e!s}" - - # Process and index each event - documents_indexed = 0 - skipped_events = [] - documents_skipped = 0 - - for event in events: - try: - event_id = event.get("id") - event_summary = event.get("summary", "No Title") - calendar_id = event.get("calendarId", "") - - if not event_id: - logger.warning(f"Skipping event with missing ID: {event_summary}") - skipped_events.append(f"{event_summary} (missing ID)") - documents_skipped += 1 - continue - - # Format event as markdown - event_markdown = calendar_client.format_event_to_markdown(event) - - if not event_markdown.strip(): - logger.warning(f"Skipping event with no content: {event_summary}") - skipped_events.append(f"{event_summary} (no content)") - documents_skipped += 1 - continue - - # Create a simple summary for the document - start = event.get("start", {}) - end = event.get("end", {}) - start_time = start.get("dateTime") or start.get("date", "") - end_time = end.get("dateTime") or end.get("date", "") - location = event.get("location", "") - description = event.get("description", "") - - summary_content = f"Google Calendar Event: {event_summary}\n\n" - summary_content += f"Calendar: {calendar_id}\n" - summary_content += f"Start: 
{start_time}\n" - summary_content += f"End: {end_time}\n" - - if location: - summary_content += f"Location: {location}\n" - - if description: - # Take first 300 characters of description for summary - desc_preview = description[:300] - if len(description) > 300: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - - # Generate content hash - content_hash = generate_content_hash(event_markdown, search_space_id) - - # Check if document already exists - existing_doc_by_hash_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - existing_document_by_hash = ( - existing_doc_by_hash_result.scalars().first() - ) - - if existing_document_by_hash: - logger.info( - f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing." - ) - documents_skipped += 1 - continue - - # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full event markdown - chunks = [ - Chunk( - content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), - ) - for chunk in config.chunker_instance.chunk(event_markdown) - ] - - # Create and store new document - logger.info(f"Creating new document for event {event_summary}") - document = Document( - search_space_id=search_space_id, - title=f"Calendar Event - {event_summary}", - document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, - document_metadata={ - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - embedding=summary_embedding, - chunks=chunks, - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new event {event_summary}") - - except Exception as e: - logger.error( - f"Error processing event {event.get('summary', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_events.append( - f"{event.get('summary', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this event and continue with others - - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - connector.last_indexed_at = datetime.now() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - - # Commit all changes - await session.commit() - logger.info( - "Successfully committed all Google Calendar document changes to database" - ) - - # Log success - await task_logger.log_task_success( - log_entry, - f"Successfully completed Google Calendar indexing for connector {connector_id}", - { - "events_processed": total_processed, - "documents_indexed": documents_indexed, - "documents_skipped": documents_skipped, - "skipped_events_count": len(skipped_events), - }, - ) - - logger.info( - f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped" - ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success - - except SQLAlchemyError as db_error: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Google Calendar indexing for connector {connector_id}", - str(db_error), - {"error_type": "SQLAlchemyError"}, - ) - logger.error(f"Database error: {db_error!s}", 
exc_info=True) - return 0, f"Database error: {db_error!s}" - except Exception as e: - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Failed to index Google Calendar events for connector {connector_id}", - str(e), - {"error_type": type(e).__name__}, - ) - logger.error(f"Failed to index Google Calendar events: {e!s}", exc_info=True) - return 0, f"Failed to index Google Calendar events: {e!s}" diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py new file mode 100644 index 0000000..a238ac8 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/__init__.py @@ -0,0 +1,47 @@ +""" +Document processors module for background tasks. + +This module provides a collection of document processors for different content types +and sources. Each processor is responsible for handling a specific type of document +processing task in the background. + +Available processors: +- URL crawler: Process web pages from URLs +- Extension processor: Handle documents from browser extension +- Markdown processor: Process markdown files +- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling) +- YouTube processor: Process YouTube videos and extract transcripts +""" + +# URL crawler +# Extension processor +from .extension_processor import add_extension_received_document + +# File processors +from .file_processors import ( + add_received_file_document_using_docling, + add_received_file_document_using_llamacloud, + add_received_file_document_using_unstructured, +) + +# Markdown processor +from .markdown_processor import add_received_markdown_file_document +from .url_crawler import add_crawled_url_document + +# YouTube processor +from .youtube_processor import add_youtube_video_document + +__all__ = [ + # URL processing + "add_crawled_url_document", + # Extension processing + "add_extension_received_document", + "add_received_file_document_using_docling", + "add_received_file_document_using_llamacloud", + # File processing with different ETL services + "add_received_file_document_using_unstructured", + # Markdown file processing + "add_received_markdown_file_document", + # YouTube video processing + "add_youtube_video_document", +] diff --git a/surfsense_backend/app/tasks/document_processors/base.py b/surfsense_backend/app/tasks/document_processors/base.py new file mode 100644 index 0000000..2b2e3ab --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/base.py @@ -0,0 +1,75 @@ +""" +Base functionality and shared imports for document processors. +""" + + +from langchain_community.document_transformers import MarkdownifyTransformer +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.config import config +from app.db import Chunk, Document +from app.prompts import SUMMARY_PROMPT_TEMPLATE + +# Initialize markdown transformer +md = MarkdownifyTransformer() + + +async def check_duplicate_document( + session: AsyncSession, content_hash: str +) -> Document | None: + """ + Check if a document with the given content hash already exists. 
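For orientation, a minimal caller-side sketch of the re-exported processors in app.tasks.document_processors; the wrapper function below is hypothetical, while the import path and the add_crawled_url_document signature come from this patch.

from sqlalchemy.ext.asyncio import AsyncSession

from app.tasks.document_processors import add_crawled_url_document


async def index_url(
    session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> None:
    """Hypothetical caller: the processor crawls, summarizes, chunks, and persists."""
    document = await add_crawled_url_document(session, url, search_space_id, user_id)
    if document is not None:
        print(f"Stored document {document.id}: {document.title}")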
+ + Args: + session: Database session + content_hash: Hash of the document content + + Returns: + Existing document if found, None otherwise + """ + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + return existing_doc_result.scalars().first() + + +async def create_document_chunks(content: str) -> list[Chunk]: + """ + Create chunks from document content. + + Args: + content: Document content to chunk + + Returns: + List of Chunk objects with embeddings + """ + return [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(content) + ] + + +async def generate_document_summary( + content: str, user_llm, document_title: str = "" +) -> tuple[str, list[float]]: + """ + Generate summary and embedding for document content. + + Args: + content: Document content + user_llm: User's LLM instance + document_title: Optional document title for context + + Returns: + Tuple of (summary_content, summary_embedding) + """ + summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm + summary_result = await summary_chain.ainvoke({"document": content}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed(summary_content) + + return summary_content, summary_embedding diff --git a/surfsense_backend/app/tasks/document_processors/extension_processor.py b/surfsense_backend/app/tasks/document_processors/extension_processor.py new file mode 100644 index 0000000..492b76d --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py @@ -0,0 +1,163 @@ +""" +Extension document processor for SurfSense browser extension. +""" + +import logging + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentType +from app.schemas import ExtensionDocumentContent +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + check_duplicate_document, + create_document_chunks, + generate_document_summary, +) + + +async def add_extension_received_document( + session: AsyncSession, + content: ExtensionDocumentContent, + search_space_id: int, + user_id: str, +) -> Document | None: + """ + Process and store document content received from the SurfSense Extension. 
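The base.py helpers above (check_duplicate_document, create_document_chunks, generate_document_summary) form the duplicate-check, summarize, chunk, persist skeleton that every processor in this package repeats. A condensed sketch of that pipeline follows; the wrapper function is hypothetical, but each call inside it mirrors code from this patch.

from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.tasks.document_processors.base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)
from app.utils.document_converters import generate_content_hash


async def store_markdown_document(
    session: AsyncSession, title: str, markdown: str, search_space_id: int, user_id: str
) -> Document:
    """Hypothetical wrapper showing the shared pipeline used by the processors."""
    content_hash = generate_content_hash(markdown, search_space_id)

    # Skip work when the same content is already indexed in this search space.
    existing = await check_duplicate_document(session, content_hash)
    if existing is not None:
        return existing

    user_llm = await get_user_long_context_llm(session, user_id)
    if not user_llm:
        raise RuntimeError(f"No long context LLM configured for user {user_id}")

    summary, summary_embedding = await generate_document_summary(markdown, user_llm)
    chunks = await create_document_chunks(markdown)

    document = Document(
        search_space_id=search_space_id,
        title=title,
        document_type=DocumentType.FILE,
        document_metadata={"FILE_NAME": title},
        content=summary,
        content_hash=content_hash,
        embedding=summary_embedding,
        chunks=chunks,
    )
    session.add(document)
    await session.commit()
    await session.refresh(document)
    return document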
+
+ Args:
+ session: Database session
+ content: Document content from extension
+ search_space_id: ID of the search space
+ user_id: ID of the user
+
+ Returns:
+ Document object if successful, None if failed
+ """
+ task_logger = TaskLoggingService(session, search_space_id)
+
+ # Log task start
+ log_entry = await task_logger.log_task_start(
+ task_name="extension_document",
+ source="background_task",
+ message=f"Processing extension document: {content.metadata.VisitedWebPageTitle}",
+ metadata={
+ "url": content.metadata.VisitedWebPageURL,
+ "title": content.metadata.VisitedWebPageTitle,
+ "user_id": str(user_id),
+ },
+ )
+
+ try:
+ # Format document metadata in a more maintainable way
+ metadata_sections = [
+ (
+ "METADATA",
+ [
+ f"SESSION_ID: {content.metadata.BrowsingSessionId}",
+ f"URL: {content.metadata.VisitedWebPageURL}",
+ f"TITLE: {content.metadata.VisitedWebPageTitle}",
+ f"REFERRER: {content.metadata.VisitedWebPageReffererURL}",
+ f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}",
+ f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}",
+ ],
+ ),
+ (
+ "CONTENT",
+ ["FORMAT: markdown", "TEXT_START", content.pageContent, "TEXT_END"],
+ ),
+ ]
+
+ # Build the document string more efficiently
+ document_parts = []
+ document_parts.append("<DOCUMENT>")
+
+ for section_title, section_content in metadata_sections:
+ document_parts.append(f"<{section_title}>")
+ document_parts.extend(section_content)
+ document_parts.append(f"</{section_title}>")
+
+ document_parts.append("</DOCUMENT>")
+ combined_document_string = "\n".join(document_parts)
+ content_hash = generate_content_hash(combined_document_string, search_space_id)
+
+ # Check if document with this content hash already exists
+ existing_document = await check_duplicate_document(session, content_hash)
+ if existing_document:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
+ {
+ "duplicate_detected": True,
+ "existing_document_id": existing_document.id,
+ },
+ )
+ logging.info(
+ f"Document with content hash {content_hash} already exists. Skipping processing."
+ ) + return existing_document + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + summary_content, summary_embedding = await generate_document_summary( + combined_document_string, user_llm + ) + + # Process chunks + chunks = await create_document_chunks(content.pageContent) + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=content.metadata.VisitedWebPageTitle, + document_type=DocumentType.EXTENSION, + document_metadata=content.metadata.model_dump(), + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully processed extension document: {content.metadata.VisitedWebPageTitle}", + { + "document_id": document.id, + "content_hash": content_hash, + "url": content.metadata.VisitedWebPageURL, + }, + ) + + return document + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error processing extension document: {content.metadata.VisitedWebPageTitle}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + raise db_error + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to process extension document: {content.metadata.VisitedWebPageTitle}", + str(e), + {"error_type": type(e).__name__}, + ) + raise RuntimeError(f"Failed to process extension document: {e!s}") from e diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py new file mode 100644 index 0000000..34403e5 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -0,0 +1,261 @@ +""" +File document processors for different ETL services (Unstructured, LlamaCloud, Docling). +""" + +import logging + +from langchain_core.documents import Document as LangChainDocument +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.utils.document_converters import ( + convert_document_to_markdown, + generate_content_hash, +) + +from .base import ( + check_duplicate_document, + create_document_chunks, + generate_document_summary, +) + + +async def add_received_file_document_using_unstructured( + session: AsyncSession, + file_name: str, + unstructured_processed_elements: list[LangChainDocument], + search_space_id: int, + user_id: str, +) -> Document | None: + """ + Process and store a file document using Unstructured service. 
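For context on where unstructured_processed_elements might come from, a hedged caller sketch follows; the UnstructuredFileLoader choice and its mode="elements" flag are assumptions about the upload path, which this patch does not show.

from langchain_community.document_loaders import UnstructuredFileLoader
from sqlalchemy.ext.asyncio import AsyncSession

from app.tasks.document_processors import (
    add_received_file_document_using_unstructured,
)


async def index_uploaded_file(
    session: AsyncSession, path: str, search_space_id: int, user_id: str
):
    """Hypothetical caller: parse a file into elements, then hand them to the processor."""
    elements = UnstructuredFileLoader(path, mode="elements").load()
    return await add_received_file_document_using_unstructured(
        session,
        file_name=path,
        unstructured_processed_elements=elements,
        search_space_id=search_space_id,
        user_id=user_id,
    )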
+ + Args: + session: Database session + file_name: Name of the processed file + unstructured_processed_elements: Processed elements from Unstructured + search_space_id: ID of the search space + user_id: ID of the user + + Returns: + Document object if successful, None if failed + """ + try: + file_in_markdown = await convert_document_to_markdown( + unstructured_processed_elements + ) + + content_hash = generate_content_hash(file_in_markdown, search_space_id) + + # Check if document with this content hash already exists + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." + ) + return existing_document + + # TODO: Check if file_markdown exceeds token limit of embedding model + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + summary_content, summary_embedding = await generate_document_summary( + file_in_markdown, user_llm + ) + + # Process chunks + chunks = await create_document_chunks(file_in_markdown) + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "UNSTRUCTURED", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document: {e!s}") from e + + +async def add_received_file_document_using_llamacloud( + session: AsyncSession, + file_name: str, + llamacloud_markdown_document: str, + search_space_id: int, + user_id: str, +) -> Document | None: + """ + Process and store document content parsed by LlamaCloud. + + Args: + session: Database session + file_name: Name of the processed file + llamacloud_markdown_document: Markdown content from LlamaCloud parsing + search_space_id: ID of the search space + user_id: ID of the user + + Returns: + Document object if successful, None if failed + """ + try: + # Combine all markdown documents into one + file_in_markdown = llamacloud_markdown_document + + content_hash = generate_content_hash(file_in_markdown, search_space_id) + + # Check if document with this content hash already exists + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." 
+ ) + return existing_document + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + summary_content, summary_embedding = await generate_document_summary( + file_in_markdown, user_llm + ) + + # Process chunks + chunks = await create_document_chunks(file_in_markdown) + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "LLAMACLOUD", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError( + f"Failed to process file document using LlamaCloud: {e!s}" + ) from e + + +async def add_received_file_document_using_docling( + session: AsyncSession, + file_name: str, + docling_markdown_document: str, + search_space_id: int, + user_id: str, +) -> Document | None: + """ + Process and store document content parsed by Docling. + + Args: + session: Database session + file_name: Name of the processed file + docling_markdown_document: Markdown content from Docling parsing + search_space_id: ID of the search space + user_id: ID of the user + + Returns: + Document object if successful, None if failed + """ + try: + file_in_markdown = docling_markdown_document + + content_hash = generate_content_hash(file_in_markdown, search_space_id) + + # Check if document with this content hash already exists + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." 
+ ) + return existing_document + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary using chunked processing for large documents + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + summary_content = await docling_service.process_large_document_summary( + content=file_in_markdown, llm=user_llm, document_title=file_name + ) + + from app.config import config + + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Process chunks + chunks = await create_document_chunks(file_in_markdown) + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "DOCLING", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError( + f"Failed to process file document using Docling: {e!s}" + ) from e diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py new file mode 100644 index 0000000..c7c8c75 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py @@ -0,0 +1,136 @@ +""" +Markdown file document processor. +""" + +import logging + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + check_duplicate_document, + create_document_chunks, + generate_document_summary, +) + + +async def add_received_markdown_file_document( + session: AsyncSession, + file_name: str, + file_in_markdown: str, + search_space_id: int, + user_id: str, +) -> Document | None: + """ + Process and store a markdown file document. 
+ + Args: + session: Database session + file_name: Name of the markdown file + file_in_markdown: Content of the markdown file + search_space_id: ID of the search space + user_id: ID of the user + + Returns: + Document object if successful, None if failed + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="markdown_file_document", + source="background_task", + message=f"Processing markdown file: {file_name}", + metadata={ + "filename": file_name, + "user_id": str(user_id), + "content_length": len(file_in_markdown), + }, + ) + + try: + content_hash = generate_content_hash(file_in_markdown, search_space_id) + + # Check if document with this content hash already exists + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + await task_logger.log_task_success( + log_entry, + f"Markdown file document already exists: {file_name}", + { + "duplicate_detected": True, + "existing_document_id": existing_document.id, + }, + ) + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." + ) + return existing_document + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + summary_content, summary_embedding = await generate_document_summary( + file_in_markdown, user_llm + ) + + # Process chunks + chunks = await create_document_chunks(file_in_markdown) + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully processed markdown file: {file_name}", + { + "document_id": document.id, + "content_hash": content_hash, + "chunks_count": len(chunks), + "summary_length": len(summary_content), + }, + ) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error processing markdown file: {file_name}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + raise db_error + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to process markdown file: {file_name}", + str(e), + {"error_type": type(e).__name__}, + ) + raise RuntimeError(f"Failed to process file document: {e!s}") from e diff --git a/surfsense_backend/app/tasks/document_processors/url_crawler.py b/surfsense_backend/app/tasks/document_processors/url_crawler.py new file mode 100644 index 0000000..85d1bc0 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/url_crawler.py @@ -0,0 +1,242 @@ +""" +URL crawler document processor. 
+"""
+
+import logging
+
+import validators
+from langchain_community.document_loaders import AsyncChromiumLoader, FireCrawlLoader
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import Document, DocumentType
+from app.services.llm_service import get_user_long_context_llm
+from app.services.task_logging_service import TaskLoggingService
+from app.utils.document_converters import generate_content_hash
+
+from .base import (
+ check_duplicate_document,
+ create_document_chunks,
+ generate_document_summary,
+ md,
+)
+
+
+async def add_crawled_url_document(
+ session: AsyncSession, url: str, search_space_id: int, user_id: str
+) -> Document | None:
+ """
+ Process and store a document from a crawled URL.
+
+ Args:
+ session: Database session
+ url: URL to crawl
+ search_space_id: ID of the search space
+ user_id: ID of the user
+
+ Returns:
+ Document object if successful, None if failed
+ """
+ task_logger = TaskLoggingService(session, search_space_id)
+
+ # Log task start
+ log_entry = await task_logger.log_task_start(
+ task_name="crawl_url_document",
+ source="background_task",
+ message=f"Starting URL crawling process for: {url}",
+ metadata={"url": url, "user_id": str(user_id)},
+ )
+
+ try:
+ # URL validation step
+ await task_logger.log_task_progress(
+ log_entry, f"Validating URL: {url}", {"stage": "validation"}
+ )
+
+ if not validators.url(url):
+ raise ValueError(f"Url {url} is not a valid URL address")
+
+ # Set up crawler
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Setting up crawler for URL: {url}",
+ {
+ "stage": "crawler_setup",
+ "firecrawl_available": bool(config.FIRECRAWL_API_KEY),
+ },
+ )
+
+ if config.FIRECRAWL_API_KEY:
+ crawl_loader = FireCrawlLoader(
+ url=url,
+ api_key=config.FIRECRAWL_API_KEY,
+ mode="scrape",
+ params={
+ "formats": ["markdown"],
+ "excludeTags": ["a"],
+ },
+ )
+ else:
+ crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
+
+ # Perform crawling
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Crawling URL content: {url}",
+ {"stage": "crawling", "crawler_type": type(crawl_loader).__name__},
+ )
+
+ url_crawled = await crawl_loader.aload()
+
+ if isinstance(crawl_loader, FireCrawlLoader):
+ content_in_markdown = url_crawled[0].page_content
+ elif isinstance(crawl_loader, AsyncChromiumLoader):
+ content_in_markdown = md.transform_documents(url_crawled)[0].page_content
+
+ # Format document
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Processing crawled content from: {url}",
+ {"stage": "content_processing", "content_length": len(content_in_markdown)},
+ )
+
+ # Format document metadata in a more maintainable way
+ metadata_sections = [
+ (
+ "METADATA",
+ [
+ f"{key.upper()}: {value}"
+ for key, value in url_crawled[0].metadata.items()
+ ],
+ ),
+ (
+ "CONTENT",
+ ["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"],
+ ),
+ ]
+
+ # Build the document string more efficiently
+ document_parts = []
+ document_parts.append("<DOCUMENT>")
+
+ for section_title, section_content in metadata_sections:
+ document_parts.append(f"<{section_title}>")
+ document_parts.extend(section_content)
+ document_parts.append(f"</{section_title}>")
+
+ document_parts.append("</DOCUMENT>")
+ combined_document_string = "\n".join(document_parts)
+ content_hash = generate_content_hash(combined_document_string, search_space_id)
+
+ # Check for duplicates
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Checking for duplicate content: {url}",
+ {"stage":
"duplicate_check", "content_hash": content_hash}, + ) + + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + await task_logger.log_task_success( + log_entry, + f"Document already exists for URL: {url}", + { + "duplicate_detected": True, + "existing_document_id": existing_document.id, + }, + ) + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." + ) + return existing_document + + # Get LLM for summary generation + await task_logger.log_task_progress( + log_entry, + f"Preparing for summary generation: {url}", + {"stage": "llm_setup"}, + ) + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + await task_logger.log_task_progress( + log_entry, + f"Generating summary for URL content: {url}", + {"stage": "summary_generation"}, + ) + + summary_content, summary_embedding = await generate_document_summary( + combined_document_string, user_llm + ) + + # Process chunks + await task_logger.log_task_progress( + log_entry, + f"Processing content chunks for URL: {url}", + {"stage": "chunk_processing"}, + ) + + chunks = await create_document_chunks(content_in_markdown) + + # Create and store document + await task_logger.log_task_progress( + log_entry, + f"Creating document in database for URL: {url}", + {"stage": "document_creation", "chunks_count": len(chunks)}, + ) + + document = Document( + search_space_id=search_space_id, + title=url_crawled[0].metadata["title"] + if isinstance(crawl_loader, FireCrawlLoader) + else url_crawled[0].metadata["source"], + document_type=DocumentType.CRAWLED_URL, + document_metadata=url_crawled[0].metadata, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully crawled and processed URL: {url}", + { + "document_id": document.id, + "title": document.title, + "content_hash": content_hash, + "chunks_count": len(chunks), + "summary_length": len(summary_content), + }, + ) + + return document + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error while processing URL: {url}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + raise db_error + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to crawl URL: {url}", + str(e), + {"error_type": type(e).__name__}, + ) + raise RuntimeError(f"Failed to crawl URL: {e!s}") from e diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py new file mode 100644 index 0000000..f75ce0a --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -0,0 +1,326 @@ +""" +YouTube video document processor. 
+""" + +import logging +from urllib.parse import parse_qs, urlparse + +import aiohttp +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession +from youtube_transcript_api import YouTubeTranscriptApi + +from app.db import Document, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.services.task_logging_service import TaskLoggingService +from app.utils.document_converters import generate_content_hash + +from .base import ( + check_duplicate_document, + create_document_chunks, + generate_document_summary, +) + + +def get_youtube_video_id(url: str) -> str | None: + """ + Extract video ID from various YouTube URL formats. + + Args: + url: YouTube URL + + Returns: + Video ID if found, None otherwise + """ + parsed_url = urlparse(url) + hostname = parsed_url.hostname + + if hostname == "youtu.be": + return parsed_url.path[1:] + if hostname in ("www.youtube.com", "youtube.com"): + if parsed_url.path == "/watch": + query_params = parse_qs(parsed_url.query) + return query_params.get("v", [None])[0] + if parsed_url.path.startswith("/embed/"): + return parsed_url.path.split("/")[2] + if parsed_url.path.startswith("/v/"): + return parsed_url.path.split("/")[2] + return None + + +async def add_youtube_video_document( + session: AsyncSession, url: str, search_space_id: int, user_id: str +) -> Document: + """ + Process a YouTube video URL, extract transcripts, and store as a document. + + Args: + session: Database session for storing the document + url: YouTube video URL (supports standard, shortened, and embed formats) + search_space_id: ID of the search space to add the document to + user_id: ID of the user + + Returns: + Document: The created document object + + Raises: + ValueError: If the YouTube video ID cannot be extracted from the URL + SQLAlchemyError: If there's a database error + RuntimeError: If the video processing fails + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="youtube_video_document", + source="background_task", + message=f"Starting YouTube video processing for: {url}", + metadata={"url": url, "user_id": str(user_id)}, + ) + + try: + # Extract video ID from URL + await task_logger.log_task_progress( + log_entry, + f"Extracting video ID from URL: {url}", + {"stage": "video_id_extraction"}, + ) + + # Get video ID + video_id = get_youtube_video_id(url) + if not video_id: + raise ValueError(f"Could not extract video ID from URL: {url}") + + await task_logger.log_task_progress( + log_entry, + f"Video ID extracted: {video_id}", + {"stage": "video_id_extracted", "video_id": video_id}, + ) + + # Get video metadata + await task_logger.log_task_progress( + log_entry, + f"Fetching video metadata for: {video_id}", + {"stage": "metadata_fetch"}, + ) + + params = { + "format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}", + } + oembed_url = "https://www.youtube.com/oembed" + + async with ( + aiohttp.ClientSession() as http_session, + http_session.get(oembed_url, params=params) as response, + ): + video_data = await response.json() + + await task_logger.log_task_progress( + log_entry, + f"Video metadata fetched: {video_data.get('title', 'Unknown')}", + { + "stage": "metadata_fetched", + "title": video_data.get("title"), + "author": video_data.get("author_name"), + }, + ) + + # Get video transcript + await task_logger.log_task_progress( + log_entry, + f"Fetching transcript for video: {video_id}", + {"stage": 
"transcript_fetch"}, + ) + + try: + captions = YouTubeTranscriptApi.get_transcript(video_id) + # Include complete caption information with timestamps + transcript_segments = [] + for line in captions: + start_time = line.get("start", 0) + duration = line.get("duration", 0) + text = line.get("text", "") + timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]" + transcript_segments.append(f"{timestamp} {text}") + transcript_text = "\n".join(transcript_segments) + + await task_logger.log_task_progress( + log_entry, + f"Transcript fetched successfully: {len(captions)} segments", + { + "stage": "transcript_fetched", + "segments_count": len(captions), + "transcript_length": len(transcript_text), + }, + ) + except Exception as e: + transcript_text = f"No captions available for this video. Error: {e!s}" + await task_logger.log_task_progress( + log_entry, + f"No transcript available for video: {video_id}", + {"stage": "transcript_unavailable", "error": str(e)}, + ) + + # Format document + await task_logger.log_task_progress( + log_entry, + f"Processing video content: {video_data.get('title', 'YouTube Video')}", + {"stage": "content_processing"}, + ) + + # Format document metadata in a more maintainable way + metadata_sections = [ + ( + "METADATA", + [ + f"TITLE: {video_data.get('title', 'YouTube Video')}", + f"URL: {url}", + f"VIDEO_ID: {video_id}", + f"AUTHOR: {video_data.get('author_name', 'Unknown')}", + f"THUMBNAIL: {video_data.get('thumbnail_url', '')}", + ], + ), + ( + "CONTENT", + ["FORMAT: transcript", "TEXT_START", transcript_text, "TEXT_END"], + ), + ] + + # Build the document string more efficiently + document_parts = [] + document_parts.append("") + + for section_title, section_content in metadata_sections: + document_parts.append(f"<{section_title}>") + document_parts.extend(section_content) + document_parts.append(f"") + + document_parts.append("") + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash(combined_document_string, search_space_id) + + # Check for duplicates + await task_logger.log_task_progress( + log_entry, + f"Checking for duplicate video content: {video_id}", + {"stage": "duplicate_check", "content_hash": content_hash}, + ) + + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + await task_logger.log_task_success( + log_entry, + f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}", + { + "duplicate_detected": True, + "existing_document_id": existing_document.id, + "video_id": video_id, + }, + ) + logging.info( + f"Document with content hash {content_hash} already exists. Skipping processing." 
+ ) + return existing_document + + # Get LLM for summary generation + await task_logger.log_task_progress( + log_entry, + f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}", + {"stage": "llm_setup"}, + ) + + # Get user's long context LLM + user_llm = await get_user_long_context_llm(session, user_id) + if not user_llm: + raise RuntimeError(f"No long context LLM configured for user {user_id}") + + # Generate summary + await task_logger.log_task_progress( + log_entry, + f"Generating summary for video: {video_data.get('title', 'YouTube Video')}", + {"stage": "summary_generation"}, + ) + + summary_content, summary_embedding = await generate_document_summary( + combined_document_string, user_llm + ) + + # Process chunks + await task_logger.log_task_progress( + log_entry, + f"Processing content chunks for video: {video_data.get('title', 'YouTube Video')}", + {"stage": "chunk_processing"}, + ) + + chunks = await create_document_chunks(combined_document_string) + + # Create document + await task_logger.log_task_progress( + log_entry, + f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", + {"stage": "document_creation", "chunks_count": len(chunks)}, + ) + + document = Document( + title=video_data.get("title", "YouTube Video"), + document_type=DocumentType.YOUTUBE_VIDEO, + document_metadata={ + "url": url, + "video_id": video_id, + "video_title": video_data.get("title", "YouTube Video"), + "author": video_data.get("author_name", "Unknown"), + "thumbnail": video_data.get("thumbnail_url", ""), + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + search_space_id=search_space_id, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully processed YouTube video: {video_data.get('title', 'YouTube Video')}", + { + "document_id": document.id, + "video_id": video_id, + "title": document.title, + "content_hash": content_hash, + "chunks_count": len(chunks), + "summary_length": len(summary_content), + "has_transcript": "No captions available" not in transcript_text, + }, + ) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error while processing YouTube video: {url}", + str(db_error), + { + "error_type": "SQLAlchemyError", + "video_id": video_id if "video_id" in locals() else None, + }, + ) + raise db_error + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to process YouTube video: {url}", + str(e), + { + "error_type": type(e).__name__, + "video_id": video_id if "video_id" in locals() else None, + }, + ) + logging.error(f"Failed to process YouTube video: {e!s}") + raise
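+
+
+if __name__ == "__main__":
+    # Illustrative smoke check only: exercises the pure URL-parsing helper above
+    # and touches no network, database, or LLM resources. "VIDEOID123" is a
+    # placeholder video ID.
+    for sample_url in (
+        "https://youtu.be/VIDEOID123",
+        "https://www.youtube.com/watch?v=VIDEOID123",
+        "https://www.youtube.com/embed/VIDEOID123",
+        "https://www.youtube.com/v/VIDEOID123",
+        "https://example.com/not-youtube",
+    ):
+        print(f"{sample_url} -> {get_youtube_video_id(sample_url)}")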