Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-02 02:29:08 +00:00)
refactor: refactored background_tasks & indexing_tasks
This commit is contained in:
parent 356bbb86f5
commit 5aa52375c3
24 changed files with 4704 additions and 5149 deletions
@@ -1,188 +0,0 @@
import unittest
from datetime import datetime
from unittest.mock import Mock, patch

from github3.exceptions import ForbiddenError  # Import the specific exception

# Adjust the import path based on the actual location if test_github_connector.py
# is not in the same directory as github_connector.py or if paths are set up differently.
# Assuming surfsend_backend/app/connectors/test_github_connector.py
from surfsense_backend.app.connectors.github_connector import GitHubConnector


class TestGitHubConnector(unittest.TestCase):
    @patch("surfsense_backend.app.connectors.github_connector.github_login")
    def test_get_user_repositories_uses_type_all(self, mock_github_login):
        # Mock the GitHub client object and its methods
        mock_gh_instance = Mock()
        mock_github_login.return_value = mock_gh_instance

        # Mock the self.gh.me() call in __init__ to prevent an actual API call
        mock_gh_instance.me.return_value = Mock()  # Simple mock to pass initialization

        # Prepare mock repository data
        mock_repo1_data = Mock()
        mock_repo1_data.id = 1
        mock_repo1_data.name = "repo1"
        mock_repo1_data.full_name = "user/repo1"
        mock_repo1_data.private = False
        mock_repo1_data.html_url = "http://example.com/user/repo1"
        mock_repo1_data.description = "Test repo 1"
        mock_repo1_data.updated_at = datetime(
            2023, 1, 1, 10, 30, 0
        )  # Added time component

        mock_repo2_data = Mock()
        mock_repo2_data.id = 2
        mock_repo2_data.name = "org-repo"
        mock_repo2_data.full_name = "org/org-repo"
        mock_repo2_data.private = True
        mock_repo2_data.html_url = "http://example.com/org/org-repo"
        mock_repo2_data.description = "Org repo"
        mock_repo2_data.updated_at = datetime(
            2023, 1, 2, 12, 0, 0
        )  # Added time component

        # Configure the mock for gh.repositories() call
        # This method is an iterator, so it should return an iterable (e.g., a list)
        mock_gh_instance.repositories.return_value = [mock_repo1_data, mock_repo2_data]

        connector = GitHubConnector(token="fake_token")
        repositories = connector.get_user_repositories()

        # Assert that gh.repositories was called correctly
        mock_gh_instance.repositories.assert_called_once_with(
            type="all", sort="updated"
        )

        # Assert the structure and content of the returned data
        expected_repositories = [
            {
                "id": 1,
                "name": "repo1",
                "full_name": "user/repo1",
                "private": False,
                "url": "http://example.com/user/repo1",
                "description": "Test repo 1",
                "last_updated": datetime(2023, 1, 1, 10, 30, 0),
            },
            {
                "id": 2,
                "name": "org-repo",
                "full_name": "org/org-repo",
                "private": True,
                "url": "http://example.com/org/org-repo",
                "description": "Org repo",
                "last_updated": datetime(2023, 1, 2, 12, 0, 0),
            },
        ]
        self.assertEqual(repositories, expected_repositories)
        self.assertEqual(len(repositories), 2)

    @patch("surfsense_backend.app.connectors.github_connector.github_login")
    def test_get_user_repositories_handles_empty_description_and_none_updated_at(
        self, mock_github_login
    ):
        # Mock the GitHub client object and its methods
        mock_gh_instance = Mock()
        mock_github_login.return_value = mock_gh_instance
        mock_gh_instance.me.return_value = Mock()

        mock_repo_data = Mock()
        mock_repo_data.id = 1
        mock_repo_data.name = "repo_no_desc"
        mock_repo_data.full_name = "user/repo_no_desc"
        mock_repo_data.private = False
        mock_repo_data.html_url = "http://example.com/user/repo_no_desc"
        mock_repo_data.description = None  # Test None description
        mock_repo_data.updated_at = None  # Test None updated_at

        mock_gh_instance.repositories.return_value = [mock_repo_data]
        connector = GitHubConnector(token="fake_token")
        repositories = connector.get_user_repositories()

        mock_gh_instance.repositories.assert_called_once_with(
            type="all", sort="updated"
        )
        expected_repositories = [
            {
                "id": 1,
                "name": "repo_no_desc",
                "full_name": "user/repo_no_desc",
                "private": False,
                "url": "http://example.com/user/repo_no_desc",
                "description": "",  # Expect empty string
                "last_updated": None,  # Expect None
            }
        ]
        self.assertEqual(repositories, expected_repositories)

    @patch("surfsense_backend.app.connectors.github_connector.github_login")
    def test_github_connector_initialization_failure_forbidden(self, mock_github_login):
        # Test that __init__ raises ValueError on auth failure (ForbiddenError)
        mock_gh_instance = Mock()
        mock_github_login.return_value = mock_gh_instance

        # Create a mock response object for the ForbiddenError
        # The actual response structure might vary, but github3.py's ForbiddenError
        # can be instantiated with just a response object that has a status_code.
        mock_response = Mock()
        mock_response.status_code = 403  # Typically Forbidden

        # Setup the side_effect for self.gh.me()
        mock_gh_instance.me.side_effect = ForbiddenError(mock_response)

        with self.assertRaises(ValueError) as context:
            GitHubConnector(token="invalid_token_forbidden")
        self.assertIn(
            "Invalid GitHub token or insufficient permissions.", str(context.exception)
        )

    @patch("surfsense_backend.app.connectors.github_connector.github_login")
    def test_github_connector_initialization_failure_authentication_failed(
        self, mock_github_login
    ):
        # Test that __init__ raises ValueError on auth failure (AuthenticationFailed, which is a subclass of ForbiddenError)
        # For github3.py, AuthenticationFailed is more specific for token issues.
        from github3.exceptions import AuthenticationFailed

        mock_gh_instance = Mock()
        mock_github_login.return_value = mock_gh_instance

        mock_response = Mock()
        mock_response.status_code = 401  # Typically Unauthorized

        mock_gh_instance.me.side_effect = AuthenticationFailed(mock_response)

        with self.assertRaises(ValueError) as context:
            GitHubConnector(token="invalid_token_authfailed")
        self.assertIn(
            "Invalid GitHub token or insufficient permissions.", str(context.exception)
        )

    @patch("surfsense_backend.app.connectors.github_connector.github_login")
    def test_get_user_repositories_handles_api_exception(self, mock_github_login):
        mock_gh_instance = Mock()
        mock_github_login.return_value = mock_gh_instance
        mock_gh_instance.me.return_value = Mock()

        # Simulate an exception when calling repositories
        mock_gh_instance.repositories.side_effect = Exception("API Error")

        connector = GitHubConnector(token="fake_token")
        # We expect it to log an error and return an empty list
        with patch(
            "surfsense_backend.app.connectors.github_connector.logger"
        ) as mock_logger:
            repositories = connector.get_user_repositories()

        self.assertEqual(repositories, [])
        mock_logger.error.assert_called_once()
        self.assertIn(
            "Failed to fetch GitHub repositories: API Error",
            mock_logger.error.call_args[0][0],
        )


if __name__ == "__main__":
    unittest.main()
@@ -1,507 +0,0 @@
import unittest
from unittest.mock import Mock, call, patch

from slack_sdk.errors import SlackApiError

# Since test_slack_history.py is in the same directory as slack_history.py
from .slack_history import SlackHistory


class TestSlackHistoryGetAllChannels(unittest.TestCase):
    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_pagination_with_delay(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        # Mock API responses now include is_private and is_member
        page1_response = {
            "channels": [
                {"name": "general", "id": "C1", "is_private": False, "is_member": True},
                {"name": "dev", "id": "C0", "is_private": False, "is_member": True},
            ],
            "response_metadata": {"next_cursor": "cursor123"},
        }
        page2_response = {
            "channels": [
                {"name": "random", "id": "C2", "is_private": True, "is_member": True}
            ],
            "response_metadata": {"next_cursor": ""},
        }

        mock_client_instance.conversations_list.side_effect = [
            page1_response,
            page2_response,
        ]

        slack_history = SlackHistory(token="fake_token")
        channels_list = slack_history.get_all_channels(include_private=True)

        expected_channels_list = [
            {"id": "C1", "name": "general", "is_private": False, "is_member": True},
            {"id": "C0", "name": "dev", "is_private": False, "is_member": True},
            {"id": "C2", "name": "random", "is_private": True, "is_member": True},
        ]

        self.assertEqual(len(channels_list), 3)
        self.assertListEqual(
            channels_list, expected_channels_list
        )  # Assert list equality

        expected_calls = [
            call(types="public_channel,private_channel", cursor=None, limit=1000),
            call(
                types="public_channel,private_channel", cursor="cursor123", limit=1000
            ),
        ]
        mock_client_instance.conversations_list.assert_has_calls(expected_calls)
        self.assertEqual(mock_client_instance.conversations_list.call_count, 2)

        mock_sleep.assert_called_once_with(3)
        mock_logger.info.assert_called_once_with(
            "Paginating for channels, waiting 3 seconds before next call. Cursor: cursor123"
        )

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_rate_limit_with_retry_after(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 429
        mock_error_response.headers = {"Retry-After": "5"}

        successful_response = {
            "channels": [
                {"name": "general", "id": "C1", "is_private": False, "is_member": True}
            ],
            "response_metadata": {"next_cursor": ""},
        }

        mock_client_instance.conversations_list.side_effect = [
            SlackApiError(message="ratelimited", response=mock_error_response),
            successful_response,
        ]

        slack_history = SlackHistory(token="fake_token")
        channels_list = slack_history.get_all_channels(include_private=True)

        expected_channels_list = [
            {"id": "C1", "name": "general", "is_private": False, "is_member": True}
        ]
        self.assertEqual(len(channels_list), 1)
        self.assertListEqual(channels_list, expected_channels_list)

        mock_sleep.assert_called_once_with(5)
        mock_logger.warning.assert_called_once_with(
            "Slack API rate limit hit while fetching channels. Waiting for 5 seconds. Cursor: None"
        )

        expected_calls = [
            call(types="public_channel,private_channel", cursor=None, limit=1000),
            call(types="public_channel,private_channel", cursor=None, limit=1000),
        ]
        mock_client_instance.conversations_list.assert_has_calls(expected_calls)
        self.assertEqual(mock_client_instance.conversations_list.call_count, 2)

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_rate_limit_no_retry_after_valid_header(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 429
        mock_error_response.headers = {"Retry-After": "invalid_value"}

        successful_response = {
            "channels": [
                {"name": "general", "id": "C1", "is_private": False, "is_member": True}
            ],
            "response_metadata": {"next_cursor": ""},
        }

        mock_client_instance.conversations_list.side_effect = [
            SlackApiError(message="ratelimited", response=mock_error_response),
            successful_response,
        ]

        slack_history = SlackHistory(token="fake_token")
        channels_list = slack_history.get_all_channels(include_private=True)

        expected_channels_list = [
            {"id": "C1", "name": "general", "is_private": False, "is_member": True}
        ]
        self.assertListEqual(channels_list, expected_channels_list)
        mock_sleep.assert_called_once_with(60)  # Default fallback
        mock_logger.warning.assert_called_once_with(
            "Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None"
        )
        self.assertEqual(mock_client_instance.conversations_list.call_count, 2)

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_rate_limit_no_retry_after_header(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 429
        mock_error_response.headers = {}

        successful_response = {
            "channels": [
                {"name": "general", "id": "C1", "is_private": False, "is_member": True}
            ],
            "response_metadata": {"next_cursor": ""},
        }

        mock_client_instance.conversations_list.side_effect = [
            SlackApiError(message="ratelimited", response=mock_error_response),
            successful_response,
        ]

        slack_history = SlackHistory(token="fake_token")
        channels_list = slack_history.get_all_channels(include_private=True)

        expected_channels_list = [
            {"id": "C1", "name": "general", "is_private": False, "is_member": True}
        ]
        self.assertListEqual(channels_list, expected_channels_list)
        mock_sleep.assert_called_once_with(60)  # Default fallback
        mock_logger.warning.assert_called_once_with(
            "Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None"
        )
        self.assertEqual(mock_client_instance.conversations_list.call_count, 2)

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_other_slack_api_error(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 500
        mock_error_response.headers = {}
        mock_error_response.data = {"ok": False, "error": "internal_error"}

        original_error = SlackApiError(
            message="server error", response=mock_error_response
        )
        mock_client_instance.conversations_list.side_effect = original_error

        slack_history = SlackHistory(token="fake_token")

        with self.assertRaises(SlackApiError) as context:
            slack_history.get_all_channels(include_private=True)

        self.assertEqual(context.exception.response.status_code, 500)
        self.assertIn("server error", str(context.exception))
        mock_sleep.assert_not_called()
        mock_logger.warning.assert_not_called()  # Ensure no rate limit log
        mock_client_instance.conversations_list.assert_called_once_with(
            types="public_channel,private_channel", cursor=None, limit=1000
        )

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_get_all_channels_handles_missing_name_id_gracefully(
        self, mock_web_client, mock_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        response_with_malformed_data = {
            "channels": [
                {"id": "C1_missing_name", "is_private": False, "is_member": True},
                {"name": "channel_missing_id", "is_private": False, "is_member": True},
                {
                    "name": "general",
                    "id": "C2_valid",
                    "is_private": False,
                    "is_member": True,
                },
            ],
            "response_metadata": {"next_cursor": ""},
        }

        mock_client_instance.conversations_list.return_value = (
            response_with_malformed_data
        )

        slack_history = SlackHistory(token="fake_token")
        channels_list = slack_history.get_all_channels(include_private=True)

        expected_channels_list = [
            {
                "id": "C2_valid",
                "name": "general",
                "is_private": False,
                "is_member": True,
            }
        ]
        self.assertEqual(len(channels_list), 1)
        self.assertListEqual(channels_list, expected_channels_list)

        self.assertEqual(mock_logger.warning.call_count, 2)
        mock_logger.warning.assert_any_call(
            "Channel found with missing name or id. Data: {'id': 'C1_missing_name', 'is_private': False, 'is_member': True}"
        )
        mock_logger.warning.assert_any_call(
            "Channel found with missing name or id. Data: {'name': 'channel_missing_id', 'is_private': False, 'is_member': True}"
        )

        mock_sleep.assert_not_called()
        mock_client_instance.conversations_list.assert_called_once_with(
            types="public_channel,private_channel", cursor=None, limit=1000
        )


if __name__ == "__main__":
    unittest.main()


class TestSlackHistoryGetConversationHistory(unittest.TestCase):
    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_proactive_delay_single_page(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value
        mock_client_instance.conversations_history.return_value = {
            "messages": [{"text": "msg1"}],
            "has_more": False,
        }

        slack_history = SlackHistory(token="fake_token")
        slack_history.get_conversation_history(channel_id="C123")

        mock_time_sleep.assert_called_once_with(1.2)  # Proactive delay

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_proactive_delay_multiple_pages(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value
        mock_client_instance.conversations_history.side_effect = [
            {
                "messages": [{"text": "msg1"}],
                "has_more": True,
                "response_metadata": {"next_cursor": "cursor1"},
            },
            {"messages": [{"text": "msg2"}], "has_more": False},
        ]

        slack_history = SlackHistory(token="fake_token")
        slack_history.get_conversation_history(channel_id="C123")

        # Expected calls: 1.2 (page1), 1.2 (page2)
        self.assertEqual(mock_time_sleep.call_count, 2)
        mock_time_sleep.assert_has_calls([call(1.2), call(1.2)])

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 429
        mock_error_response.headers = {"Retry-After": "5"}

        mock_client_instance.conversations_history.side_effect = [
            SlackApiError(message="ratelimited", response=mock_error_response),
            {"messages": [{"text": "msg1"}], "has_more": False},
        ]

        slack_history = SlackHistory(token="fake_token")
        messages = slack_history.get_conversation_history(channel_id="C123")

        self.assertEqual(len(messages), 1)
        self.assertEqual(messages[0]["text"], "msg1")

        # Expected sleep calls: 1.2 (proactive for 1st attempt), 5 (rate limit), 1.2 (proactive for 2nd attempt)
        mock_time_sleep.assert_has_calls(
            [call(1.2), call(5), call(1.2)], any_order=False
        )
        mock_logger.warning.assert_called_once()  # Check that a warning was logged for rate limiting

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_not_in_channel_error(self, mock_web_client, mock_time_sleep, mock_logger):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = (
            403  # Typical for not_in_channel, but data matters more
        )
        mock_error_response.data = {"ok": False, "error": "not_in_channel"}

        # This error is now raised by the inner try-except, then caught by the outer one
        mock_client_instance.conversations_history.side_effect = SlackApiError(
            message="not_in_channel error", response=mock_error_response
        )

        slack_history = SlackHistory(token="fake_token")
        messages = slack_history.get_conversation_history(channel_id="C123")

        self.assertEqual(messages, [])
        mock_logger.warning.assert_called_with(
            "Bot is not in channel 'C123'. Cannot fetch history. Please add the bot to this channel."
        )
        mock_time_sleep.assert_called_once_with(
            1.2
        )  # Proactive delay before the API call

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_other_slack_api_error_propagates(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 500
        mock_error_response.data = {"ok": False, "error": "internal_error"}
        original_error = SlackApiError(
            message="server error", response=mock_error_response
        )

        mock_client_instance.conversations_history.side_effect = original_error

        slack_history = SlackHistory(token="fake_token")

        with self.assertRaises(SlackApiError) as context:
            slack_history.get_conversation_history(channel_id="C123")

        self.assertIn(
            "Error retrieving history for channel C123", str(context.exception)
        )
        self.assertIs(context.exception.response, mock_error_response)
        mock_time_sleep.assert_called_once_with(1.2)  # Proactive delay

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_general_exception_propagates(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value
        original_error = Exception("Something broke")
        mock_client_instance.conversations_history.side_effect = original_error

        slack_history = SlackHistory(token="fake_token")

        with self.assertRaises(Exception) as context:  # Check for generic Exception
            slack_history.get_conversation_history(channel_id="C123")

        self.assertIs(
            context.exception, original_error
        )  # Should re-raise the original error
        mock_logger.error.assert_called_once_with(
            "Unexpected error in get_conversation_history for channel C123: Something broke"
        )
        mock_time_sleep.assert_called_once_with(1.2)  # Proactive delay


class TestSlackHistoryGetUserInfo(unittest.TestCase):
    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 429
        mock_error_response.headers = {"Retry-After": "3"}  # Using 3 seconds for test

        successful_user_data = {"id": "U123", "name": "testuser"}

        mock_client_instance.users_info.side_effect = [
            SlackApiError(message="ratelimited_userinfo", response=mock_error_response),
            {"user": successful_user_data},
        ]

        slack_history = SlackHistory(token="fake_token")
        user_info = slack_history.get_user_info(user_id="U123")

        self.assertEqual(user_info, successful_user_data)

        # Assert that time.sleep was called for the rate limit
        mock_time_sleep.assert_called_once_with(3)
        mock_logger.warning.assert_called_once_with(
            "Rate limited by Slack on users.info for user U123. Retrying after 3 seconds."
        )
        # Assert users_info was called twice (original + retry)
        self.assertEqual(mock_client_instance.users_info.call_count, 2)
        mock_client_instance.users_info.assert_has_calls(
            [call(user="U123"), call(user="U123")]
        )

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch(
        "surfsense_backend.app.connectors.slack_history.time.sleep"
    )  # time.sleep might be called by other logic, but not expected here
    @patch("slack_sdk.WebClient")
    def test_other_slack_api_error_propagates(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value

        mock_error_response = Mock()
        mock_error_response.status_code = 500  # Some other error
        mock_error_response.data = {"ok": False, "error": "internal_server_error"}
        original_error = SlackApiError(
            message="internal server error", response=mock_error_response
        )

        mock_client_instance.users_info.side_effect = original_error

        slack_history = SlackHistory(token="fake_token")

        with self.assertRaises(SlackApiError) as context:
            slack_history.get_user_info(user_id="U123")

        # Check that the raised error is the one we expect
        self.assertIn("Error retrieving user info for U123", str(context.exception))
        self.assertIs(context.exception.response, mock_error_response)
        mock_time_sleep.assert_not_called()  # No rate limit sleep

    @patch("surfsense_backend.app.connectors.slack_history.logger")
    @patch("surfsense_backend.app.connectors.slack_history.time.sleep")
    @patch("slack_sdk.WebClient")
    def test_general_exception_propagates(
        self, mock_web_client, mock_time_sleep, mock_logger
    ):
        mock_client_instance = mock_web_client.return_value
        original_error = Exception("A very generic problem")
        mock_client_instance.users_info.side_effect = original_error

        slack_history = SlackHistory(token="fake_token")

        with self.assertRaises(Exception) as context:
            slack_history.get_user_info(user_id="U123")

        self.assertIs(
            context.exception, original_error
        )  # Check it's the exact same exception
        mock_logger.error.assert_called_once_with(
            "Unexpected error in get_user_info for user U123: A very generic problem"
        )
        mock_time_sleep.assert_not_called()  # No rate limit sleep
@@ -10,7 +10,7 @@ from app.config import config as app_config
 from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
 from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
 from app.services.task_logging_service import TaskLoggingService
-from app.tasks.background_tasks import (
+from app.tasks.document_processors import (
     add_crawled_url_document,
     add_extension_received_document,
     add_received_file_document_using_docling,
@@ -35,7 +35,7 @@ from app.schemas import (
     SearchSourceConnectorRead,
     SearchSourceConnectorUpdate,
 )
-from app.tasks.connectors_indexing_tasks import (
+from app.tasks.connector_indexers import (
     index_clickup_tasks,
     index_confluence_pages,
     index_discord_messages,
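The two hunks above capture the caller-facing side of the refactor: document-processing helpers now come from app.tasks.document_processors, and per-connector indexing moved from the single connectors_indexing_tasks module into the app.tasks.connector_indexers package. A minimal sketch of how a caller's imports change; the function names come from the hunks, while the idea of a combined caller module is only an illustration:

# Before this commit (old module layout):
# from app.tasks.background_tasks import add_crawled_url_document
# from app.tasks.connectors_indexing_tasks import index_clickup_tasks

# After this commit, the same names are imported from the new packages:
from app.tasks.document_processors import add_crawled_url_document
from app.tasks.connector_indexers import index_clickup_tasks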
File diff suppressed because it is too large.

surfsense_backend/app/tasks/connector_indexers/__init__.py (new file, 54 lines)
@@ -0,0 +1,54 @@
"""
Connector indexers module for background tasks.

This module provides a collection of connector indexers for different platforms
and services. Each indexer is responsible for handling the indexing of content
from a specific connector type.

Available indexers:
- Slack: Index messages from Slack channels
- Notion: Index pages from Notion workspaces
- GitHub: Index repositories and files from GitHub
- Linear: Index issues from Linear workspaces
- Jira: Index issues from Jira projects
- Confluence: Index pages from Confluence spaces
- Discord: Index messages from Discord servers
- ClickUp: Index tasks from ClickUp workspaces
- Google Calendar: Index events from Google Calendar
"""

# Communication platforms
from .clickup_indexer import index_clickup_tasks
from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages

# Development platforms
from .github_indexer import index_github_repos

# Calendar and scheduling
from .google_calendar_indexer import index_google_calendar_events
from .jira_indexer import index_jira_issues

# Issue tracking and project management
from .linear_indexer import index_linear_issues

# Documentation and knowledge management
from .notion_indexer import index_notion_pages
from .slack_indexer import index_slack_messages

__all__ = [
    "index_clickup_tasks",
    "index_confluence_pages",
    "index_discord_messages",
    # Development platforms
    "index_github_repos",
    # Calendar and scheduling
    "index_google_calendar_events",
    "index_jira_issues",
    # Issue tracking and project management
    "index_linear_issues",
    # Documentation and knowledge management
    "index_notion_pages",
    "index_slack_messages",
]
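Because __init__.py re-exports every indexer and lists it in __all__, call sites (such as the routes hunk above) can import directly from the package. A minimal sketch, assuming only the re-exports shown above; the dispatcher dict is illustrative, not part of this commit:

# Build a name-to-callable map from the package's public surface, so a
# dispatcher can look up indexer coroutines without touching submodules.
from app.tasks import connector_indexers

INDEXER_BY_NAME = {
    name: getattr(connector_indexers, name) for name in connector_indexers.__all__
}
# e.g. INDEXER_BY_NAME["index_github_repos"] is the GitHub indexer coroutine.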
surfsense_backend/app/tasks/connector_indexers/base.py (new file, 183 lines)
@@ -0,0 +1,183 @@
"""
Base functionality and shared imports for connector indexers.
"""

import logging
from datetime import datetime, timedelta

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.config import config
from app.db import (
    Chunk,
    Document,
    SearchSourceConnector,
    SearchSourceConnectorType,
)

# Set up logging
logger = logging.getLogger(__name__)


async def check_duplicate_document_by_hash(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """
    Check if a document with the given content hash already exists.

    Args:
        session: Database session
        content_hash: Hash of the document content

    Returns:
        Existing document if found, None otherwise
    """
    existing_doc_result = await session.execute(
        select(Document).where(Document.content_hash == content_hash)
    )
    return existing_doc_result.scalars().first()


async def create_document_chunks(content: str) -> list[Chunk]:
    """
    Create chunks from document content.

    Args:
        content: Document content to chunk

    Returns:
        List of Chunk objects with embeddings
    """
    return [
        Chunk(
            content=chunk.text,
            embedding=config.embedding_model_instance.embed(chunk.text),
        )
        for chunk in config.chunker_instance.chunk(content)
    ]


async def get_connector_by_id(
    session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
) -> SearchSourceConnector | None:
    """
    Get a connector by ID and type from the database.

    Args:
        session: Database session
        connector_id: ID of the connector
        connector_type: Expected type of the connector

    Returns:
        Connector object if found, None otherwise
    """
    result = await session.execute(
        select(SearchSourceConnector).filter(
            SearchSourceConnector.id == connector_id,
            SearchSourceConnector.connector_type == connector_type,
        )
    )
    return result.scalars().first()


def calculate_date_range(
    connector: SearchSourceConnector,
    start_date: str | None = None,
    end_date: str | None = None,
    default_days_back: int = 365,
) -> tuple[str, str]:
    """
    Calculate date range for indexing based on provided dates or connector's last indexed date.

    Args:
        connector: The connector object
        start_date: Optional start date string (YYYY-MM-DD)
        end_date: Optional end date string (YYYY-MM-DD)
        default_days_back: Default number of days to go back if no last indexed date

    Returns:
        Tuple of (start_date_str, end_date_str)
    """
    if start_date is not None and end_date is not None:
        return start_date, end_date

    # Fall back to calculating dates based on last_indexed_at
    calculated_end_date = datetime.now()

    # Use last_indexed_at as start date if available, otherwise use default_days_back
    if connector.last_indexed_at:
        # Convert dates to be comparable (both timezone-naive)
        last_indexed_naive = (
            connector.last_indexed_at.replace(tzinfo=None)
            if connector.last_indexed_at.tzinfo
            else connector.last_indexed_at
        )

        # Check if last_indexed_at is in the future or after end_date
        if last_indexed_naive > calculated_end_date:
            logger.warning(
                f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using {default_days_back} days ago instead."
            )
            calculated_start_date = calculated_end_date - timedelta(
                days=default_days_back
            )
        else:
            calculated_start_date = last_indexed_naive
            logger.info(
                f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
            )
    else:
        calculated_start_date = calculated_end_date - timedelta(days=default_days_back)
        logger.info(
            f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} ({default_days_back} days ago) as start date"
        )

    # Use calculated dates if not provided
    start_date_str = (
        start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
    )
    end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")

    return start_date_str, end_date_str


async def update_connector_last_indexed(
    session: AsyncSession,
    connector: SearchSourceConnector,
    update_last_indexed: bool = True,
) -> None:
    """
    Update the last_indexed_at timestamp for a connector.

    Args:
        session: Database session
        connector: The connector object
        update_last_indexed: Whether to actually update the timestamp
    """
    if update_last_indexed:
        connector.last_indexed_at = datetime.now()
        logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")


def build_document_metadata_string(
    metadata_sections: list[tuple[str, list[str]]],
) -> str:
    """
    Build a document string from metadata sections.

    Args:
        metadata_sections: List of (section_title, section_content) tuples

    Returns:
        Combined document string
    """
    document_parts = ["<DOCUMENT>"]

    for section_title, section_content in metadata_sections:
        document_parts.append(f"<{section_title}>")
        document_parts.extend(section_content)
        document_parts.append(f"</{section_title}>")

    document_parts.append("</DOCUMENT>")
    return "\n".join(document_parts)
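The helpers in base.py are meant to be composed by each per-connector indexer. A small illustration of the pure helper build_document_metadata_string, with made-up section titles and content (the function itself is as defined above); calculate_date_range behaves analogously, falling back to connector.last_indexed_at or default_days_back when explicit dates are not passed:

from app.tasks.connector_indexers.base import build_document_metadata_string

# Illustrative sections only; an indexer would fill these from connector data.
metadata_doc = build_document_metadata_string(
    [
        ("METADATA", ["Title: Weekly sync", "Source: Confluence"]),
        ("CONTENT", ["Agenda item 1", "Agenda item 2"]),
    ]
)
# metadata_doc is a single string of the form:
# <DOCUMENT>
# <METADATA>
# Title: Weekly sync
# Source: Confluence
# </METADATA>
# <CONTENT>
# ...
# </CONTENT>
# </DOCUMENT>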
@@ -0,0 +1,301 @@
"""
ClickUp connector indexer.
"""

from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.clickup_connector import ClickUpConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document_by_hash,
    create_document_chunks,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
)


async def index_clickup_tasks(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index tasks from ClickUp workspace.

    Args:
        session: Database session
        connector_id: ID of the ClickUp connector
        search_space_id: ID of the search space
        user_id: ID of the user
        start_date: Start date for filtering tasks (YYYY-MM-DD format)
        end_date: End date for filtering tasks (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp

    Returns:
        Tuple of (number of indexed tasks, error message if any)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="clickup_tasks_indexing",
        source="connector_indexing_task",
        message=f"Starting ClickUp tasks indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get connector configuration
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.CLICKUP_CONNECTOR
        )

        if not connector:
            error_msg = f"ClickUp connector with ID {connector_id} not found"
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a ClickUp connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, error_msg

        # Extract ClickUp configuration
        clickup_api_token = connector.config.get("CLICKUP_API_TOKEN")

        if not clickup_api_token:
            error_msg = "ClickUp API token not found in connector configuration"
            await task_logger.log_task_failure(
                log_entry,
                f"ClickUp API token not found in connector config for connector {connector_id}",
                "Missing ClickUp token",
                {"error_type": "MissingToken"},
            )
            return 0, error_msg

        await task_logger.log_task_progress(
            log_entry,
            f"Initializing ClickUp client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        clickup_client = ClickUpConnector(api_token=clickup_api_token)

        # Get authorized workspaces
        await task_logger.log_task_progress(
            log_entry,
            "Fetching authorized ClickUp workspaces",
            {"stage": "workspace_fetching"},
        )

        workspaces_response = clickup_client.get_authorized_workspaces()
        workspaces = workspaces_response.get("teams", [])

        if not workspaces:
            error_msg = "No authorized ClickUp workspaces found"
            await task_logger.log_task_failure(
                log_entry,
                f"No authorized ClickUp workspaces found for connector {connector_id}",
                "No workspaces found",
                {"error_type": "NoWorkspacesFound"},
            )
            return 0, error_msg

        documents_indexed = 0
        documents_skipped = 0

        # Iterate workspaces and fetch tasks
        for workspace in workspaces:
            workspace_id = workspace.get("id")
            workspace_name = workspace.get("name", "Unknown Workspace")
            if not workspace_id:
                continue

            await task_logger.log_task_progress(
                log_entry,
                f"Processing workspace: {workspace_name}",
                {"stage": "workspace_processing", "workspace_id": workspace_id},
            )

            # Fetch tasks for date range if provided
            if start_date and end_date:
                tasks, error = clickup_client.get_tasks_in_date_range(
                    workspace_id=workspace_id,
                    start_date=start_date,
                    end_date=end_date,
                    include_closed=True,
                )
                if error:
                    logger.warning(
                        f"Error fetching tasks from workspace {workspace_name}: {error}"
                    )
                    continue
            else:
                tasks = clickup_client.get_workspace_tasks(
                    workspace_id=workspace_id, include_closed=True
                )

            await task_logger.log_task_progress(
                log_entry,
                f"Found {len(tasks)} tasks in workspace {workspace_name}",
                {"stage": "tasks_found", "task_count": len(tasks)},
            )

            for task in tasks:
                try:
                    task_id = task.get("id")
                    task_name = task.get("name", "Untitled Task")
                    task_description = task.get("description", "")
                    task_status = task.get("status", {}).get("status", "Unknown")
                    task_priority = (
                        task.get("priority", {}).get("priority", "Unknown")
                        if task.get("priority")
                        else "None"
                    )
                    task_assignees = task.get("assignees", [])
                    task_due_date = task.get("due_date")
                    task_created = task.get("date_created")
                    task_updated = task.get("date_updated")

                    task_list = task.get("list", {})
                    task_list_name = task_list.get("name", "Unknown List")
                    task_space = task.get("space", {})
                    task_space_name = task_space.get("name", "Unknown Space")

                    # Build task content string
                    content_parts: list[str] = [f"Task: {task_name}"]
                    if task_description:
                        content_parts.append(f"Description: {task_description}")
                    content_parts.extend(
                        [
                            f"Status: {task_status}",
                            f"Priority: {task_priority}",
                            f"List: {task_list_name}",
                            f"Space: {task_space_name}",
                        ]
                    )
                    if task_assignees:
                        assignee_names = [
                            assignee.get("username", "Unknown")
                            for assignee in task_assignees
                        ]
                        content_parts.append(
                            f"Assignees: {', '.join(assignee_names)}"
                        )
                    if task_due_date:
                        content_parts.append(f"Due Date: {task_due_date}")

                    task_content = "\n".join(content_parts)
                    if not task_content.strip():
                        logger.warning(f"Skipping task with no content: {task_name}")
                        documents_skipped += 1
                        continue

                    # Hash for duplicates
                    content_hash = generate_content_hash(task_content, search_space_id)
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )
                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing."
                        )
                        documents_skipped += 1
                        continue

                    # Embedding and chunks
                    summary_embedding = config.embedding_model_instance.embed(
                        task_content
                    )
                    chunks = await create_document_chunks(task_content)

                    document = Document(
                        search_space_id=search_space_id,
                        title=f"Task - {task_name}",
                        document_type=DocumentType.CLICKUP_CONNECTOR,
                        document_metadata={
                            "task_id": task_id,
                            "task_name": task_name,
                            "task_status": task_status,
                            "task_priority": task_priority,
                            "task_assignees": task_assignees,
                            "task_due_date": task_due_date,
                            "task_created": task_created,
                            "task_updated": task_updated,
                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        },
                        content=task_content,
                        content_hash=content_hash,
                        embedding=summary_embedding,
                        chunks=chunks,
                    )

                    session.add(document)
                    documents_indexed += 1
                    logger.info(f"Successfully indexed new task {task_name}")

                except Exception as e:
                    logger.error(
                        f"Error processing task {task.get('name', 'Unknown')}: {e!s}",
                        exc_info=True,
                    )
                    documents_skipped += 1

        total_processed = documents_indexed

        if total_processed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed clickup indexing for connector {connector_id}",
            {
                "pages_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
            },
        )

        logger.info(
            f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped"
        )
        return total_processed, None

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during ClickUp indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index ClickUp tasks for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index ClickUp tasks: {e!s}", exc_info=True)
        return 0, f"Failed to index ClickUp tasks: {e!s}"
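Each indexer follows the same contract as index_clickup_tasks above: it logs through TaskLoggingService, commits or rolls back its own session, and returns (count, error) instead of raising. A minimal sketch of a caller, assuming only the signature shown above; the wrapper function and the example dates are illustrative, not part of this commit:

from app.tasks.connector_indexers import index_clickup_tasks

async def run_clickup_indexing(session, connector_id: int, search_space_id: int, user_id: str) -> int:
    # Dates are optional; without them the indexer fetches all workspace tasks
    # (include_closed=True) rather than filtering by date range.
    indexed, error = await index_clickup_tasks(
        session=session,
        connector_id=connector_id,
        search_space_id=search_space_id,
        user_id=user_id,
        start_date="2024-01-01",
        end_date="2024-03-31",
    )
    if error:
        # The indexer already logged the failure; surface it to the caller.
        raise RuntimeError(f"ClickUp indexing failed: {error}")
    return indexed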
@@ -0,0 +1,338 @@
"""
Confluence connector indexer.
"""

from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.confluence_connector import ConfluenceConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    calculate_date_range,
    check_duplicate_document_by_hash,
    create_document_chunks,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
)


async def index_confluence_pages(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index Confluence pages and comments.

    Args:
        session: Database session
        connector_id: ID of the Confluence connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="confluence_pages_indexing",
        source="connector_indexing_task",
        message=f"Starting Confluence pages indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector from the database
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.CONFLUENCE_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the Confluence credentials from the connector config
        confluence_email = connector.config.get("CONFLUENCE_EMAIL")
        confluence_api_token = connector.config.get("CONFLUENCE_API_TOKEN")
        confluence_base_url = connector.config.get("CONFLUENCE_BASE_URL")

        if not confluence_email or not confluence_api_token or not confluence_base_url:
            await task_logger.log_task_failure(
                log_entry,
                f"Confluence credentials not found in connector config for connector {connector_id}",
                "Missing Confluence credentials",
                {"error_type": "MissingCredentials"},
            )
            return 0, "Confluence credentials not found in connector config"

        # Initialize Confluence client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Confluence client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        confluence_client = ConfluenceConnector(
            base_url=confluence_base_url,
            email=confluence_email,
            api_token=confluence_api_token,
        )

        # Calculate date range
        start_date_str, end_date_str = calculate_date_range(
            connector, start_date, end_date, default_days_back=365
        )

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Confluence pages from {start_date_str} to {end_date_str}",
            {
                "stage": "fetching_pages",
                "start_date": start_date_str,
                "end_date": end_date_str,
            },
        )

        # Get pages within date range
        try:
            pages, error = confluence_client.get_pages_by_date_range(
                start_date=start_date_str, end_date=end_date_str, include_comments=True
            )

            if error:
                logger.error(f"Failed to get Confluence pages: {error}")

                # Don't treat "No pages found" as an error that should stop indexing
                if "No pages found" in error:
                    logger.info(
                        "No pages found is not a critical error, continuing with update"
                    )
                    if update_last_indexed:
                        await update_connector_last_indexed(
                            session, connector, update_last_indexed
                        )
                        await session.commit()
                        logger.info(
                            f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found"
                        )

                    await task_logger.log_task_success(
                        log_entry,
                        f"No Confluence pages found in date range {start_date_str} to {end_date_str}",
                        {"pages_found": 0},
                    )
                    return 0, None
                else:
                    await task_logger.log_task_failure(
                        log_entry,
                        f"Failed to get Confluence pages: {error}",
                        "API Error",
                        {"error_type": "APIError"},
                    )
                    return 0, f"Failed to get Confluence pages: {error}"

            logger.info(f"Retrieved {len(pages)} pages from Confluence API")

        except Exception as e:
            logger.error(f"Error fetching Confluence pages: {e!s}", exc_info=True)
            return 0, f"Error fetching Confluence pages: {e!s}"

        # Process and index each page
        documents_indexed = 0
        skipped_pages = []
        documents_skipped = 0

        for page in pages:
            try:
                page_id = page.get("id")
                page_title = page.get("title", "")
                space_id = page.get("spaceId", "")

                if not page_id or not page_title:
                    logger.warning(
                        f"Skipping page with missing ID or title: {page_id or 'Unknown'}"
                    )
                    skipped_pages.append(f"{page_title or 'Unknown'} (missing data)")
                    documents_skipped += 1
                    continue

                # Extract page content
                page_content = ""
                if page.get("body") and page["body"].get("storage"):
                    page_content = page["body"]["storage"].get("value", "")

                # Add comments to content
                comments = page.get("comments", [])
                comments_content = ""
                if comments:
                    comments_content = "\n\n## Comments\n\n"
                    for comment in comments:
                        comment_body = ""
                        if comment.get("body") and comment["body"].get("storage"):
                            comment_body = comment["body"]["storage"].get("value", "")

                        comment_author = comment.get("version", {}).get(
                            "authorId", "Unknown"
                        )
                        comment_date = comment.get("version", {}).get("createdAt", "")

                        comments_content += f"**Comment by {comment_author}** ({comment_date}):\n{comment_body}\n\n"

                # Combine page content with comments
                full_content = f"# {page_title}\n\n{page_content}{comments_content}"

                if not full_content.strip():
                    logger.warning(f"Skipping page with no content: {page_title}")
                    skipped_pages.append(f"{page_title} (no content)")
                    documents_skipped += 1
                    continue

                # Create a simple summary
                summary_content = (
                    f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
                )
                if page_content:
                    # Take first 500 characters of content for summary
                    content_preview = page_content[:500]
                    if len(page_content) > 500:
                        content_preview += "..."
                    summary_content += f"Content Preview: {content_preview}\n\n"

                # Add comment count
                comment_count = len(comments)
                summary_content += f"Comments: {comment_count}"

                # Generate content hash
                content_hash = generate_content_hash(full_content, search_space_id)

                # Check if document already exists
                existing_document_by_hash = await check_duplicate_document_by_hash(
                    session, content_hash
                )

                if existing_document_by_hash:
                    logger.info(
                        f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue

                # Generate embedding for the summary
                summary_embedding = config.embedding_model_instance.embed(
                    summary_content
                )

                # Process chunks - using the full page content with comments
                chunks = await create_document_chunks(full_content)

                # Create and store new document
                logger.info(f"Creating new document for page {page_title}")
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Confluence - {page_title}",
                    document_type=DocumentType.CONFLUENCE_CONNECTOR,
                    document_metadata={
                        "page_id": page_id,
                        "page_title": page_title,
                        "space_id": space_id,
                        "comment_count": comment_count,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new page {page_title}")

            except Exception as e:
                logger.error(
                    f"Error processing page {page.get('title', 'Unknown')}: {e!s}",
                    exc_info=True,
                )
                skipped_pages.append(
                    f"{page.get('title', 'Unknown')} (processing error)"
                )
                documents_skipped += 1
                continue  # Skip this page and continue with others

        # Update the last_indexed_at timestamp for the connector only if requested
        total_processed = documents_indexed
        if update_last_indexed:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        # Commit all changes
        await session.commit()
        logger.info(
            "Successfully committed all Confluence document changes to database"
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Confluence indexing for connector {connector_id}",
            {
                "pages_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_pages_count": len(skipped_pages),
            },
        )

        logger.info(
            f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
        )
        return (
            total_processed,
            None,
        )  # Return None as the error message to indicate success

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during Confluence indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index Confluence pages for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index Confluence pages: {e!s}", exc_info=True)
        return 0, f"Failed to index Confluence pages: {e!s}"
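A minimal usage sketch for the indexer above (not part of the commit), assuming an async SQLAlchemy session factory named async_session_maker and existing connector/search-space IDs:

import asyncio

async def demo_confluence_indexing():
    # Illustrative only: the IDs, user UUID, and session factory are assumptions.
    async with async_session_maker() as session:
        indexed, error = await index_confluence_pages(
            session=session,
            connector_id=1,
            search_space_id=1,
            user_id="user-uuid",
            start_date="2024-01-01",
            end_date="2024-01-31",
        )
        print(f"Indexed {indexed} Confluence pages, error={error}")

# asyncio.run(demo_confluence_indexing())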
@@ -0,0 +1,441 @@
"""
Discord connector indexer.
"""

import asyncio
from datetime import UTC, datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.discord_connector import DiscordConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    build_document_metadata_string,
    check_duplicate_document_by_hash,
    create_document_chunks,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
)


async def index_discord_messages(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index Discord messages from all accessible channels.

    Args:
        session: Database session
        connector_id: ID of the Discord connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="discord_messages_indexing",
        source="connector_indexing_task",
        message=f"Starting Discord messages indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector
        await task_logger.log_task_progress(
            log_entry,
            f"Retrieving Discord connector {connector_id} from database",
            {"stage": "connector_retrieval"},
        )

        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.DISCORD_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a Discord connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return (
                0,
                f"Connector with ID {connector_id} not found or is not a Discord connector",
            )

        # Get the Discord token from the connector config
        discord_token = connector.config.get("DISCORD_BOT_TOKEN")
        if not discord_token:
            await task_logger.log_task_failure(
                log_entry,
                f"Discord token not found in connector config for connector {connector_id}",
                "Missing Discord token",
                {"error_type": "MissingToken"},
            )
            return 0, "Discord token not found in connector config"

        logger.info(f"Starting Discord indexing for connector {connector_id}")

        # Initialize Discord client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Discord client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        discord_client = DiscordConnector(token=discord_token)

        # Calculate date range
        if start_date is None or end_date is None:
            # Fall back to calculating dates based on last_indexed_at
            calculated_end_date = datetime.now(UTC)

            # Use last_indexed_at as start date if available, otherwise use 365 days ago
            if connector.last_indexed_at:
                calculated_start_date = connector.last_indexed_at.replace(tzinfo=UTC)
                logger.info(
                    f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
                )
            else:
                calculated_start_date = calculated_end_date - timedelta(days=365)
                logger.info(
                    f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
                )

            # Use calculated dates if not provided, convert to ISO format for Discord API
            if start_date is None:
                start_date_iso = calculated_start_date.isoformat()
            else:
                # Convert YYYY-MM-DD to ISO format
                start_date_iso = (
                    datetime.strptime(start_date, "%Y-%m-%d")
                    .replace(tzinfo=UTC)
                    .isoformat()
                )

            if end_date is None:
                end_date_iso = calculated_end_date.isoformat()
            else:
                # Convert YYYY-MM-DD to ISO format
                end_date_iso = (
                    datetime.strptime(end_date, "%Y-%m-%d")
                    .replace(tzinfo=UTC)
                    .isoformat()
                )
        else:
            # Convert provided dates to ISO format for Discord API
            start_date_iso = (
                datetime.strptime(start_date, "%Y-%m-%d")
                .replace(tzinfo=UTC)
                .isoformat()
            )
            end_date_iso = (
                datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC).isoformat()
            )

        logger.info(
            f"Indexing Discord messages from {start_date_iso} to {end_date_iso}"
        )

        try:
            await task_logger.log_task_progress(
                log_entry,
                f"Starting Discord bot and fetching guilds for connector {connector_id}",
                {"stage": "fetch_guilds"},
            )

            logger.info("Starting Discord bot to fetch guilds")
            discord_client._bot_task = asyncio.create_task(discord_client.start_bot())
            await discord_client._wait_until_ready()

            logger.info("Fetching Discord guilds")
            guilds = await discord_client.get_guilds()
            logger.info(f"Found {len(guilds)} guilds")
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to get Discord guilds for connector {connector_id}",
                str(e),
                {"error_type": "GuildFetchError"},
            )
            logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True)
            await discord_client.close_bot()
            return 0, f"Failed to get Discord guilds: {e!s}"

        if not guilds:
            await task_logger.log_task_success(
                log_entry,
                f"No Discord guilds found for connector {connector_id}",
                {"guilds_found": 0},
            )
            logger.info("No Discord guilds found to index")
            await discord_client.close_bot()
            return 0, "No Discord guilds found"

        # Track results
        documents_indexed = 0
        documents_skipped = 0
        skipped_channels: list[str] = []

        # Process each guild and channel
        await task_logger.log_task_progress(
            log_entry,
            f"Starting to process {len(guilds)} Discord guilds",
            {"stage": "process_guilds", "total_guilds": len(guilds)},
        )

        try:
            for guild in guilds:
                guild_id = guild["id"]
                guild_name = guild["name"]
                logger.info(f"Processing guild: {guild_name} ({guild_id})")

                try:
                    channels = await discord_client.get_text_channels(guild_id)
                    if not channels:
                        logger.info(f"No channels found in guild {guild_name}. Skipping.")
                        skipped_channels.append(f"{guild_name} (no channels)")
                        documents_skipped += 1
                        continue

                    for channel in channels:
                        channel_id = channel["id"]
                        channel_name = channel["name"]

                        try:
                            messages = await discord_client.get_channel_history(
                                channel_id=channel_id,
                                start_date=start_date_iso,
                                end_date=end_date_iso,
                            )
                        except Exception as e:
                            logger.error(
                                f"Failed to get messages for channel {channel_name}: {e!s}"
                            )
                            skipped_channels.append(
                                f"{guild_name}#{channel_name} (fetch error)"
                            )
                            documents_skipped += 1
                            continue

                        if not messages:
                            logger.info(
                                f"No messages found in channel {channel_name} for the specified date range."
                            )
                            documents_skipped += 1
                            continue

                        # Filter/format messages
                        formatted_messages: list[dict] = []
                        for msg in messages:
                            # Optionally skip system messages
                            if msg.get("type") in ["system"]:
                                continue
                            formatted_messages.append(msg)

                        if not formatted_messages:
                            logger.info(
                                f"No valid messages found in channel {channel_name} after filtering."
                            )
                            documents_skipped += 1
                            continue

                        # Convert messages to markdown format
                        channel_content = (
                            f"# Discord Channel: {guild_name} / {channel_name}\n\n"
                        )
                        for msg in formatted_messages:
                            user_name = msg.get("author_name", "Unknown User")
                            timestamp = msg.get("created_at", "Unknown Time")
                            text = msg.get("content", "")
                            channel_content += (
                                f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
                            )

                        # Metadata sections
                        metadata_sections = [
                            (
                                "METADATA",
                                [
                                    f"GUILD_NAME: {guild_name}",
                                    f"GUILD_ID: {guild_id}",
                                    f"CHANNEL_NAME: {channel_name}",
                                    f"CHANNEL_ID: {channel_id}",
                                    f"MESSAGE_COUNT: {len(formatted_messages)}",
                                ],
                            ),
                            (
                                "CONTENT",
                                [
                                    "FORMAT: markdown",
                                    "TEXT_START",
                                    channel_content,
                                    "TEXT_END",
                                ],
                            ),
                        ]

                        combined_document_string = build_document_metadata_string(
                            metadata_sections
                        )
                        content_hash = generate_content_hash(
                            combined_document_string, search_space_id
                        )

                        # Skip duplicates by hash
                        existing_document_by_hash = await check_duplicate_document_by_hash(
                            session, content_hash
                        )
                        if existing_document_by_hash:
                            logger.info(
                                f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing."
                            )
                            documents_skipped += 1
                            continue

                        # Get user's long context LLM
                        user_llm = await get_user_long_context_llm(session, user_id)
                        if not user_llm:
                            logger.error(
                                f"No long context LLM configured for user {user_id}"
                            )
                            skipped_channels.append(
                                f"{guild_name}#{channel_name} (no LLM configured)"
                            )
                            documents_skipped += 1
                            continue

                        # Generate summary using summary_chain
                        summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
                        summary_result = await summary_chain.ainvoke(
                            {"document": combined_document_string}
                        )
                        summary_content = summary_result.content
                        summary_embedding = config.embedding_model_instance.embed(
                            summary_content
                        )

                        # Chunks from channel content
                        chunks = await create_document_chunks(channel_content)

                        # Create and store new document
                        document = Document(
                            search_space_id=search_space_id,
                            title=f"Discord - {guild_name}#{channel_name}",
                            document_type=DocumentType.DISCORD_CONNECTOR,
                            document_metadata={
                                "guild_name": guild_name,
                                "guild_id": guild_id,
                                "channel_name": channel_name,
                                "channel_id": channel_id,
                                "message_count": len(formatted_messages),
                                "start_date": start_date_iso,
                                "end_date": end_date_iso,
                                "indexed_at": datetime.now(UTC).strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            },
                            content=summary_content,
                            content_hash=content_hash,
                            embedding=summary_embedding,
                            chunks=chunks,
                        )

                        session.add(document)
                        documents_indexed += 1
                        logger.info(
                            f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
                        )

                except Exception as e:
                    logger.error(
                        f"Error processing guild {guild_name}: {e!s}", exc_info=True
                    )
                    skipped_channels.append(f"{guild_name} (processing error)")
                    documents_skipped += 1
                    continue
        finally:
            await discord_client.close_bot()

        # Update last_indexed_at only if we indexed at least one
        if documents_indexed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        # Prepare result message
        result_message = None
        if skipped_channels:
            result_message = (
                f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: "
                + ", ".join(skipped_channels)
            )
        else:
            result_message = f"Processed {documents_indexed} channels."

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Discord indexing for connector {connector_id}",
            {
                "channels_processed": documents_indexed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_channels_count": len(skipped_channels),
                "guilds_processed": len(guilds),
                "result_message": result_message,
            },
        )

        logger.info(
            f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
        )
        return documents_indexed, result_message

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during Discord indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index Discord messages for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index Discord messages: {e!s}", exc_info=True)
        return 0, f"Failed to index Discord messages: {e!s}"
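The Discord indexer converts YYYY-MM-DD strings to UTC ISO timestamps in three nearly identical places. A small sketch of how that conversion could be factored into a helper (a possible follow-up refactor, not part of this commit; the helper name is hypothetical):

from datetime import UTC, datetime

def _to_utc_iso(date_str: str | None, fallback: datetime) -> str:
    """Hypothetical helper: convert a YYYY-MM-DD string to a UTC ISO timestamp,
    falling back to an already-computed datetime when no string is given."""
    if date_str is None:
        return fallback.isoformat()
    return datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=UTC).isoformat()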
326	surfsense_backend/app/tasks/connector_indexers/github_indexer.py	Normal file
@@ -0,0 +1,326 @@
"""
GitHub connector indexer.
"""

from datetime import UTC, datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.github_connector import GitHubConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document_by_hash,
    create_document_chunks,
    get_connector_by_id,
    logger,
)


async def index_github_repos(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index code and documentation files from accessible GitHub repositories.

    Args:
        session: Database session
        connector_id: ID of the GitHub connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
        start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
        end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="github_repos_indexing",
        source="connector_indexing_task",
        message=f"Starting GitHub repositories indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    documents_processed = 0
    errors = []

    try:
        # 1. Get the GitHub connector from the database
        await task_logger.log_task_progress(
            log_entry,
            f"Retrieving GitHub connector {connector_id} from database",
            {"stage": "connector_retrieval"},
        )

        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return (
                0,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
            )

        # 2. Get the GitHub PAT and selected repositories from the connector config
        github_pat = connector.config.get("GITHUB_PAT")
        repo_full_names_to_index = connector.config.get("repo_full_names")

        if not github_pat:
            await task_logger.log_task_failure(
                log_entry,
                f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
                "Missing GitHub PAT",
                {"error_type": "MissingToken"},
            )
            return 0, "GitHub Personal Access Token (PAT) not found in connector config"

        if not repo_full_names_to_index or not isinstance(
            repo_full_names_to_index, list
        ):
            await task_logger.log_task_failure(
                log_entry,
                f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}",
                "Invalid repo configuration",
                {"error_type": "InvalidConfiguration"},
            )
            return 0, "'repo_full_names' not found or is not a list in connector config"

        # 3. Initialize GitHub connector client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing GitHub client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "repo_count": len(repo_full_names_to_index),
            },
        )

        try:
            github_client = GitHubConnector(token=github_pat)
        except ValueError as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to initialize GitHub client for connector {connector_id}",
                str(e),
                {"error_type": "ClientInitializationError"},
            )
            return 0, f"Failed to initialize GitHub client: {e!s}"

        # 4. Validate selected repositories
        await task_logger.log_task_progress(
            log_entry,
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
            {
                "stage": "repo_processing",
                "repo_count": len(repo_full_names_to_index),
                "start_date": start_date,
                "end_date": end_date,
            },
        )

        logger.info(
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
        )
        if start_date and end_date:
            logger.info(
                f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
            )

        # 6. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue

            logger.info(f"Processing repository: {repo_full_name}")
            try:
                files_to_index = github_client.get_repository_files(repo_full_name)
                if not files_to_index:
                    logger.info(
                        f"No indexable files found in repository: {repo_full_name}"
                    )
                    continue

                logger.info(
                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
                )

                for file_info in files_to_index:
                    file_path = file_info.get("path")
                    file_url = file_info.get("url")
                    file_sha = file_info.get("sha")
                    file_type = file_info.get("type")  # 'code' or 'doc'
                    full_path_key = f"{repo_full_name}/{file_path}"

                    if not file_path or not file_url or not file_sha:
                        logger.warning(
                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
                        )
                        continue

                    # Get file content
                    file_content = github_client.get_file_content(
                        repo_full_name, file_path
                    )

                    if file_content is None:
                        logger.warning(
                            f"Could not retrieve content for {full_path_key}. Skipping."
                        )
                        continue  # Skip if content fetch failed

                    content_hash = generate_content_hash(file_content, search_space_id)

                    # Check if document with this content hash already exists
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )

                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
                        )
                        continue

                    # Use file_content directly for chunking, maybe summary for main content?
                    # For now, let's use the full content for both, might need refinement
                    summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple summary
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                    # Chunk the content
                    try:
                        chunks_data = [
                            await create_document_chunks(file_content)
                        ][0]

                        # Use code chunker if available, otherwise regular chunker
                        if hasattr(config, "code_chunker_instance"):
                            chunks_data = [
                                {
                                    "content": chunk.text,
                                    "embedding": config.embedding_model_instance.embed(
                                        chunk.text
                                    ),
                                }
                                for chunk in config.code_chunker_instance.chunk(
                                    file_content
                                )
                            ]
                        else:
                            chunks_data = await create_document_chunks(file_content)

                    except Exception as chunk_err:
                        logger.error(
                            f"Failed to chunk file {full_path_key}: {chunk_err}"
                        )
                        errors.append(
                            f"Chunking failed for {full_path_key}: {chunk_err}"
                        )
                        continue  # Skip this file if chunking fails

                    doc_metadata = {
                        "repository_full_name": repo_full_name,
                        "file_path": file_path,
                        "full_path": full_path_key,  # For easier lookup
                        "url": file_url,
                        "sha": file_sha,
                        "type": file_type,
                        "indexed_at": datetime.now(UTC).isoformat(),
                    }

                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
                    )
                    session.add(document)
                    documents_processed += 1

            except Exception as repo_err:
                logger.error(
                    f"Failed to process repository {repo_full_name}: {repo_err}"
                )
                errors.append(f"Failed processing {repo_full_name}: {repo_err}")

        # Commit all changes at the end
        await session.commit()
        logger.info(
            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed GitHub indexing for connector {connector_id}",
            {
                "documents_processed": documents_processed,
                "errors_count": len(errors),
                "repo_count": len(repo_full_names_to_index),
            },
        )

    except SQLAlchemyError as db_err:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during GitHub indexing for connector {connector_id}",
            str(db_err),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(
            f"Database error during GitHub indexing for connector {connector_id}: {db_err}"
        )
        errors.append(f"Database error: {db_err}")
        return documents_processed, "; ".join(errors) if errors else str(db_err)
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Unexpected error during GitHub indexing for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(
            f"Unexpected error during GitHub indexing for connector {connector_id}: {e}",
            exc_info=True,
        )
        errors.append(f"Unexpected error: {e}")
        return documents_processed, "; ".join(errors) if errors else str(e)

    error_message = "; ".join(errors) if errors else None
    return documents_processed, error_message
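A minimal usage sketch for the GitHub indexer above (not part of the commit), assuming an async session factory named async_session_maker and a connector whose config already holds GITHUB_PAT and a repo_full_names list:

async def demo_github_indexing():
    # Illustrative only: connector_id=3 and the session factory are assumptions.
    async with async_session_maker() as session:
        processed, error = await index_github_repos(
            session, connector_id=3, search_space_id=1, user_id="user-uuid"
        )
        logger.info(f"GitHub indexing processed {processed} files (error={error})")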
@@ -0,0 +1,350 @@
"""
Google Calendar connector indexer.
"""

from datetime import datetime, timedelta

from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.google_calendar_connector import GoogleCalendarConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    create_document_chunks,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
)


async def index_google_calendar_events(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index Google Calendar events.

    Args:
        session: Database session
        connector_id: ID of the Google Calendar connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="google_calendar_events_indexing",
        source="connector_indexing_task",
        message=f"Starting Google Calendar events indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector from the database
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the Google Calendar credentials from the connector config
        credentials = Credentials(
            token=connector.config.get("token"),
            refresh_token=connector.config.get("refresh_token"),
            token_uri=connector.config.get("token_uri"),
            client_id=connector.config.get("client_id"),
            client_secret=connector.config.get("client_secret"),
            scopes=connector.config.get("scopes"),
        )

        if (
            not credentials.client_id
            or not credentials.client_secret
            or not credentials.refresh_token
        ):
            await task_logger.log_task_failure(
                log_entry,
                f"Google Calendar credentials not found in connector config for connector {connector_id}",
                "Missing Google Calendar credentials",
                {"error_type": "MissingCredentials"},
            )
            return 0, "Google Calendar credentials not found in connector config"

        # Initialize Google Calendar client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Google Calendar client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        calendar_client = GoogleCalendarConnector(credentials=credentials)

        # Calculate date range
        if start_date is None or end_date is None:
            # Fall back to calculating dates based on last_indexed_at
            calculated_end_date = datetime.now()

            # Use last_indexed_at as start date if available, otherwise use 30 days ago
            if connector.last_indexed_at:
                # Convert dates to be comparable (both timezone-naive)
                last_indexed_naive = (
                    connector.last_indexed_at.replace(tzinfo=None)
                    if connector.last_indexed_at.tzinfo
                    else connector.last_indexed_at
                )

                # Check if last_indexed_at is in the future or after end_date
                if last_indexed_naive > calculated_end_date:
                    logger.warning(
                        f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead."
                    )
                    calculated_start_date = calculated_end_date - timedelta(days=30)
                else:
                    calculated_start_date = last_indexed_naive
                    logger.info(
                        f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
                    )
            else:
                calculated_start_date = calculated_end_date - timedelta(
                    days=30
                )  # Use 30 days as default for calendar events
                logger.info(
                    f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (30 days ago) as start date"
                )

            # Use calculated dates if not provided
            start_date_str = (
                start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
            )
            end_date_str = (
                end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
            )
        else:
            # Use provided dates
            start_date_str = start_date
            end_date_str = end_date

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
            {
                "stage": "fetching_events",
                "start_date": start_date_str,
                "end_date": end_date_str,
            },
        )

        # Get events within date range from primary calendar
        try:
            events, error = calendar_client.get_all_primary_calendar_events(
                start_date=start_date_str, end_date=end_date_str
            )

            if error:
                logger.error(f"Failed to get Google Calendar events: {error}")

                # Don't treat "No events found" as an error that should stop indexing
                if "No events found" in error:
                    logger.info(
                        "No events found is not a critical error, continuing with update"
                    )
                    if update_last_indexed:
                        await update_connector_last_indexed(
                            session, connector, update_last_indexed
                        )
                        await session.commit()
                        logger.info(
                            f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found"
                        )

                    await task_logger.log_task_success(
                        log_entry,
                        f"No Google Calendar events found in date range {start_date_str} to {end_date_str}",
                        {"events_found": 0},
                    )
                    return 0, None
                else:
                    await task_logger.log_task_failure(
                        log_entry,
                        f"Failed to get Google Calendar events: {error}",
                        "API Error",
                        {"error_type": "APIError"},
                    )
                    return 0, f"Failed to get Google Calendar events: {error}"

            logger.info(f"Retrieved {len(events)} events from Google Calendar API")

        except Exception as e:
            logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
            return 0, f"Error fetching Google Calendar events: {e!s}"

        documents_indexed = 0
        documents_skipped = 0
        skipped_events = []

        for event in events:
            try:
                event_id = event.get("id")
                event_summary = event.get("summary", "No Title")
                calendar_id = event.get("calendarId", "")

                if not event_id:
                    logger.warning(f"Skipping event with missing ID: {event_summary}")
                    skipped_events.append(f"{event_summary} (missing ID)")
                    documents_skipped += 1
                    continue

                event_markdown = calendar_client.format_event_to_markdown(event)
                if not event_markdown.strip():
                    logger.warning(f"Skipping event with no content: {event_summary}")
                    skipped_events.append(f"{event_summary} (no content)")
                    documents_skipped += 1
                    continue

                start = event.get("start", {})
                end = event.get("end", {})
                start_time = start.get("dateTime") or start.get("date", "")
                end_time = end.get("dateTime") or end.get("date", "")
                location = event.get("location", "")
                description = event.get("description", "")

                summary_content = f"Google Calendar Event: {event_summary}\n\n"
                summary_content += f"Calendar: {calendar_id}\n"
                summary_content += f"Start: {start_time}\n"
                summary_content += f"End: {end_time}\n"
                if location:
                    summary_content += f"Location: {location}\n"
                if description:
                    desc_preview = description[:300]
                    if len(description) > 300:
                        desc_preview += "..."
                    summary_content += f"Description: {desc_preview}\n"

                content_hash = generate_content_hash(event_markdown, search_space_id)

                # Duplicate check via simple query using helper in base
                from .base import (
                    check_duplicate_document_by_hash,  # local import to avoid circular at module import
                )

                existing_document_by_hash = await check_duplicate_document_by_hash(
                    session, content_hash
                )
                if existing_document_by_hash:
                    logger.info(
                        f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue

                # Embeddings and chunks
                summary_embedding = config.embedding_model_instance.embed(
                    summary_content
                )
                chunks = await create_document_chunks(event_markdown)

                document = Document(
                    search_space_id=search_space_id,
                    title=f"Calendar Event - {event_summary}",
                    document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
                    document_metadata={
                        "event_id": event_id,
                        "event_summary": event_summary,
                        "calendar_id": calendar_id,
                        "start_time": start_time,
                        "end_time": end_time,
                        "location": location,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new event {event_summary}")

            except Exception as e:
                logger.error(
                    f"Error processing event {event.get('summary', 'Unknown')}: {e!s}",
                    exc_info=True,
                )
                skipped_events.append(
                    f"{event.get('summary', 'Unknown')} (processing error)"
                )
                documents_skipped += 1
                continue

        total_processed = documents_indexed
        if total_processed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        await session.commit()

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Google Calendar indexing for connector {connector_id}",
            {
                "events_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_events_count": len(skipped_events),
            },
        )

        logger.info(
            f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
        )
        return total_processed, None

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during Google Calendar indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index Google Calendar events for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index Google Calendar events: {e!s}", exc_info=True)
        return 0, f"Failed to index Google Calendar events: {e!s}"
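The calendar indexer's start-date fallback strips tzinfo so the comparison is naive-vs-naive and ignores a future-dated last_indexed_at. A standalone sketch of that logic (assumed helper for illustration, not part of the commit):

from datetime import datetime, timedelta

def resolve_calendar_start(last_indexed_at: datetime | None, now: datetime) -> datetime:
    """Hypothetical helper mirroring the fallback above: default to 30 days back,
    otherwise reuse last_indexed_at unless it lies in the future."""
    if last_indexed_at is None:
        return now - timedelta(days=30)
    naive = (
        last_indexed_at.replace(tzinfo=None) if last_indexed_at.tzinfo else last_indexed_at
    )
    return now - timedelta(days=30) if naive > now else naive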
320
surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
Normal file
320
surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
Normal file
|
@ -0,0 +1,320 @@
|
||||||
|
"""
|
||||||
|
Jira connector indexer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.connectors.jira_connector import JiraConnector
|
||||||
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
from app.utils.document_converters import generate_content_hash
|
||||||
|
|
||||||
|
from .base import (
|
||||||
|
calculate_date_range,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
|
create_document_chunks,
|
||||||
|
get_connector_by_id,
|
||||||
|
logger,
|
||||||
|
update_connector_last_indexed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def index_jira_issues(
|
||||||
|
session: AsyncSession,
|
||||||
|
connector_id: int,
|
||||||
|
search_space_id: int,
|
||||||
|
user_id: str,
|
||||||
|
start_date: str | None = None,
|
||||||
|
end_date: str | None = None,
|
||||||
|
update_last_indexed: bool = True,
|
||||||
|
) -> tuple[int, str | None]:
|
||||||
|
"""
|
||||||
|
Index Jira issues and comments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
connector_id: ID of the Jira connector
|
||||||
|
search_space_id: ID of the search space to store documents in
|
||||||
|
user_id: User ID
|
||||||
|
start_date: Start date for indexing (YYYY-MM-DD format)
|
||||||
|
end_date: End date for indexing (YYYY-MM-DD format)
|
||||||
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple containing (number of documents indexed, error message or None)
|
||||||
|
"""
|
||||||
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
|
# Log task start
|
||||||
|
log_entry = await task_logger.log_task_start(
|
||||||
|
task_name="jira_issues_indexing",
|
||||||
|
source="connector_indexing_task",
|
||||||
|
message=f"Starting Jira issues indexing for connector {connector_id}",
|
||||||
|
metadata={
|
||||||
|
"connector_id": connector_id,
|
||||||
|
"user_id": str(user_id),
|
||||||
|
"start_date": start_date,
|
||||||
|
"end_date": end_date,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the connector from the database
|
||||||
|
connector = await get_connector_by_id(
|
||||||
|
session, connector_id, SearchSourceConnectorType.JIRA_CONNECTOR
|
||||||
|
)
|
||||||
|
|
||||||
|
if not connector:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Connector with ID {connector_id} not found",
|
||||||
|
"Connector not found",
|
||||||
|
{"error_type": "ConnectorNotFound"},
|
||||||
|
)
|
||||||
|
return 0, f"Connector with ID {connector_id} not found"
|
||||||
|
|
||||||
|
# Get the Jira credentials from the connector config
|
||||||
|
jira_email = connector.config.get("JIRA_EMAIL")
|
||||||
|
jira_api_token = connector.config.get("JIRA_API_TOKEN")
|
||||||
|
jira_base_url = connector.config.get("JIRA_BASE_URL")
|
||||||
|
|
||||||
|
if not jira_email or not jira_api_token or not jira_base_url:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Jira credentials not found in connector config for connector {connector_id}",
|
||||||
|
"Missing Jira credentials",
|
||||||
|
{"error_type": "MissingCredentials"},
|
||||||
|
)
|
||||||
|
return 0, "Jira credentials not found in connector config"
|
||||||
|
|
||||||
|
# Initialize Jira client
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Initializing Jira client for connector {connector_id}",
|
||||||
|
{"stage": "client_initialization"},
|
||||||
|
)
|
||||||
|
|
||||||
|
jira_client = JiraConnector(
|
||||||
|
base_url=jira_base_url, email=jira_email, api_token=jira_api_token
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate date range
|
||||||
|
start_date_str, end_date_str = calculate_date_range(
|
||||||
|
connector, start_date, end_date, default_days_back=365
|
||||||
|
)
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Fetching Jira issues from {start_date_str} to {end_date_str}",
|
||||||
|
{
|
||||||
|
"stage": "fetching_issues",
|
||||||
|
"start_date": start_date_str,
|
||||||
|
"end_date": end_date_str,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get issues within date range
|
||||||
|
try:
|
||||||
|
issues, error = jira_client.get_issues_by_date_range(
|
||||||
|
start_date=start_date_str, end_date=end_date_str, include_comments=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if error:
|
||||||
|
logger.error(f"Failed to get Jira issues: {error}")
|
||||||
|
|
||||||
|
# Don't treat "No issues found" as an error that should stop indexing
|
||||||
|
if "No issues found" in error:
|
||||||
|
logger.info(
|
||||||
|
"No issues found is not a critical error, continuing with update"
|
||||||
|
)
|
||||||
|
if update_last_indexed:
|
||||||
|
await update_connector_last_indexed(
|
||||||
|
session, connector, update_last_indexed
|
||||||
|
)
|
||||||
|
await session.commit()
|
||||||
|
logger.info(
|
||||||
|
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
|
||||||
|
)
|
||||||
|
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"No Jira issues found in date range {start_date_str} to {end_date_str}",
|
||||||
|
{"issues_found": 0},
|
||||||
|
)
|
||||||
|
return 0, None
|
||||||
|
else:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to get Jira issues: {error}",
|
||||||
|
"API Error",
|
||||||
|
{"error_type": "APIError"},
|
||||||
|
)
|
||||||
|
return 0, f"Failed to get Jira issues: {error}"
|
||||||
|
|
||||||
|
logger.info(f"Retrieved {len(issues)} issues from Jira API")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Error fetching Jira issues: {e!s}"
|
||||||
|
|
||||||
|
# Process and index each issue
|
||||||
|
documents_indexed = 0
|
||||||
|
skipped_issues = []
|
||||||
|
documents_skipped = 0
|
||||||
|
|
||||||
|
for issue in issues:
|
||||||
|
try:
|
||||||
|
issue_id = issue.get("key")
|
||||||
|
issue_identifier = issue.get("key", "")
|
||||||
|
issue_title = issue.get("id", "")
|
||||||
|
|
||||||
|
if not issue_id or not issue_title:
|
||||||
|
logger.warning(
|
||||||
|
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
|
||||||
|
)
|
||||||
|
skipped_issues.append(
|
||||||
|
f"{issue_identifier or 'Unknown'} (missing data)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Format the issue for better readability
|
||||||
|
formatted_issue = jira_client.format_issue(issue)
|
||||||
|
|
||||||
|
# Convert to markdown
|
||||||
|
issue_content = jira_client.format_issue_to_markdown(formatted_issue)
|
||||||
|
|
||||||
|
if not issue_content:
|
||||||
|
logger.warning(
|
||||||
|
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
skipped_issues.append(f"{issue_identifier} (no content)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create a simple summary
|
||||||
|
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
|
||||||
|
if formatted_issue.get("description"):
|
||||||
|
summary_content += (
|
||||||
|
f"Description: {formatted_issue.get('description')}\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add comment count
|
||||||
|
comment_count = len(formatted_issue.get("comments", []))
|
||||||
|
summary_content += f"Comments: {comment_count}"
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
|
content_hash = generate_content_hash(issue_content, search_space_id)
|
||||||
|
|
||||||
|
# Check if document already exists
|
||||||
|
existing_document_by_hash = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document_by_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate embedding for the summary
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks - using the full issue content with comments
|
||||||
|
chunks = await create_document_chunks(issue_content)
|
||||||
|
|
||||||
|
# Create and store new document
|
||||||
|
logger.info(
|
||||||
|
f"Creating new document for issue {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=f"Jira - {issue_identifier}: {issue_title}",
|
||||||
|
document_type=DocumentType.JIRA_CONNECTOR,
|
||||||
|
document_metadata={
|
||||||
|
"issue_id": issue_id,
|
||||||
|
"issue_identifier": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"state": formatted_issue.get("status", "Unknown"),
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
content_hash=content_hash,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
skipped_issues.append(
|
||||||
|
f"{issue.get('identifier', 'Unknown')} (processing error)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this issue and continue with others
|
||||||
|
|
||||||
|
# Update the last_indexed_at timestamp for the connector only if requested
|
||||||
|
total_processed = documents_indexed
|
||||||
|
if update_last_indexed:
|
||||||
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
|
# Commit all changes
|
||||||
|
await session.commit()
|
||||||
|
logger.info("Successfully committed all JIRA document changes to database")
|
||||||
|
|
||||||
|
# Log success
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Successfully completed JIRA indexing for connector {connector_id}",
|
||||||
|
{
|
||||||
|
"issues_processed": total_processed,
|
||||||
|
"documents_indexed": documents_indexed,
|
||||||
|
"documents_skipped": documents_skipped,
|
||||||
|
"skipped_issues_count": len(skipped_issues),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
total_processed,
|
||||||
|
None,
|
||||||
|
) # Return None as the error message to indicate success
|
||||||
|
|
||||||
|
except SQLAlchemyError as db_error:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Database error during JIRA indexing for connector {connector_id}",
|
||||||
|
str(db_error),
|
||||||
|
{"error_type": "SQLAlchemyError"},
|
||||||
|
)
|
||||||
|
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||||
|
return 0, f"Database error: {db_error!s}"
|
||||||
|
except Exception as e:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to index JIRA issues for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": type(e).__name__},
|
||||||
|
)
|
||||||
|
logger.error(f"Failed to index JIRA issues: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Failed to index JIRA issues: {e!s}"
|
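The indexers above all share the same calling convention: an async session plus connector, search space, and user identifiers. Below is a minimal sketch (not part of this diff) of how one of them might be invoked from a background task; `async_session_maker` and the literal IDs are assumptions for illustration only.

```python
# Hedged usage sketch for the Jira indexer; names marked "assumed" are not from the diff.
import asyncio

from app.db import async_session_maker  # assumed async session factory
from app.tasks.connector_indexers.jira_indexer import index_jira_issues


async def run_jira_indexing() -> None:
    async with async_session_maker() as session:
        indexed, error = await index_jira_issues(
            session=session,
            connector_id=1,          # assumed connector ID
            search_space_id=1,       # assumed search space ID
            user_id="user-123",      # assumed user ID
            start_date="2024-01-01",
            end_date="2024-12-31",
        )
        print(f"Indexed {indexed} documents, error: {error}")


if __name__ == "__main__":
    asyncio.run(run_jira_indexing())
```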
surfsense_backend/app/tasks/connector_indexers/linear_indexer.py (new file, 337 lines)
@@ -0,0 +1,337 @@
"""
|
||||||
|
Linear connector indexer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.connectors.linear_connector import LinearConnector
|
||||||
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
from app.utils.document_converters import generate_content_hash
|
||||||
|
|
||||||
|
from .base import (
|
||||||
|
calculate_date_range,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
|
create_document_chunks,
|
||||||
|
get_connector_by_id,
|
||||||
|
logger,
|
||||||
|
update_connector_last_indexed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def index_linear_issues(
|
||||||
|
session: AsyncSession,
|
||||||
|
connector_id: int,
|
||||||
|
search_space_id: int,
|
||||||
|
user_id: str,
|
||||||
|
start_date: str | None = None,
|
||||||
|
end_date: str | None = None,
|
||||||
|
update_last_indexed: bool = True,
|
||||||
|
) -> tuple[int, str | None]:
|
||||||
|
"""
|
||||||
|
Index Linear issues and comments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
connector_id: ID of the Linear connector
|
||||||
|
search_space_id: ID of the search space to store documents in
|
||||||
|
user_id: ID of the user
|
||||||
|
start_date: Start date for indexing (YYYY-MM-DD format)
|
||||||
|
end_date: End date for indexing (YYYY-MM-DD format)
|
||||||
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple containing (number of documents indexed, error message or None)
|
||||||
|
"""
|
||||||
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
|
# Log task start
|
||||||
|
log_entry = await task_logger.log_task_start(
|
||||||
|
task_name="linear_issues_indexing",
|
||||||
|
source="connector_indexing_task",
|
||||||
|
message=f"Starting Linear issues indexing for connector {connector_id}",
|
||||||
|
metadata={
|
||||||
|
"connector_id": connector_id,
|
||||||
|
"user_id": str(user_id),
|
||||||
|
"start_date": start_date,
|
||||||
|
"end_date": end_date,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the connector
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Retrieving Linear connector {connector_id} from database",
|
||||||
|
{"stage": "connector_retrieval"},
|
||||||
|
)
|
||||||
|
|
||||||
|
connector = await get_connector_by_id(
|
||||||
|
session, connector_id, SearchSourceConnectorType.LINEAR_CONNECTOR
|
||||||
|
)
|
||||||
|
|
||||||
|
if not connector:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Linear connector",
|
||||||
|
"Connector not found",
|
||||||
|
{"error_type": "ConnectorNotFound"},
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
0,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Linear connector",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the Linear token from the connector config
|
||||||
|
linear_token = connector.config.get("LINEAR_API_KEY")
|
||||||
|
if not linear_token:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Linear API token not found in connector config for connector {connector_id}",
|
||||||
|
"Missing Linear token",
|
||||||
|
{"error_type": "MissingToken"},
|
||||||
|
)
|
||||||
|
return 0, "Linear API token not found in connector config"
|
||||||
|
|
||||||
|
# Initialize Linear client
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Initializing Linear client for connector {connector_id}",
|
||||||
|
{"stage": "client_initialization"},
|
||||||
|
)
|
||||||
|
|
||||||
|
linear_client = LinearConnector(token=linear_token)
|
||||||
|
|
||||||
|
# Calculate date range
|
||||||
|
start_date_str, end_date_str = calculate_date_range(
|
||||||
|
connector, start_date, end_date, default_days_back=365
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}")
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Fetching Linear issues from {start_date_str} to {end_date_str}",
|
||||||
|
{
|
||||||
|
"stage": "fetch_issues",
|
||||||
|
"start_date": start_date_str,
|
||||||
|
"end_date": end_date_str,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get issues within date range
|
||||||
|
try:
|
||||||
|
issues, error = linear_client.get_issues_by_date_range(
|
||||||
|
start_date=start_date_str, end_date=end_date_str, include_comments=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if error:
|
||||||
|
logger.error(f"Failed to get Linear issues: {error}")
|
||||||
|
|
||||||
|
# Don't treat "No issues found" as an error that should stop indexing
|
||||||
|
if "No issues found" in error:
|
||||||
|
logger.info(
|
||||||
|
"No issues found is not a critical error, continuing with update"
|
||||||
|
)
|
||||||
|
if update_last_indexed:
|
||||||
|
await update_connector_last_indexed(
|
||||||
|
session, connector, update_last_indexed
|
||||||
|
)
|
||||||
|
await session.commit()
|
||||||
|
logger.info(
|
||||||
|
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
|
||||||
|
)
|
||||||
|
return 0, None
|
||||||
|
else:
|
||||||
|
return 0, f"Failed to get Linear issues: {error}"
|
||||||
|
|
||||||
|
logger.info(f"Retrieved {len(issues)} issues from Linear API")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Exception when calling Linear API: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Failed to get Linear issues: {e!s}"
|
||||||
|
|
||||||
|
if not issues:
|
||||||
|
logger.info("No Linear issues found for the specified date range")
|
||||||
|
if update_last_indexed:
|
||||||
|
await update_connector_last_indexed(
|
||||||
|
session, connector, update_last_indexed
|
||||||
|
)
|
||||||
|
await session.commit()
|
||||||
|
logger.info(
|
||||||
|
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
|
||||||
|
)
|
||||||
|
return 0, None # Return None instead of error message when no issues found
|
||||||
|
|
||||||
|
# Track the number of documents indexed
|
||||||
|
documents_indexed = 0
|
||||||
|
documents_skipped = 0
|
||||||
|
skipped_issues = []
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Starting to process {len(issues)} Linear issues",
|
||||||
|
{"stage": "process_issues", "total_issues": len(issues)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process each issue
|
||||||
|
for issue in issues:
|
||||||
|
try:
|
||||||
|
issue_id = issue.get("id", "")
|
||||||
|
issue_identifier = issue.get("identifier", "")
|
||||||
|
issue_title = issue.get("title", "")
|
||||||
|
|
||||||
|
if not issue_id or not issue_title:
|
||||||
|
logger.warning(
|
||||||
|
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
|
||||||
|
)
|
||||||
|
skipped_issues.append(
|
||||||
|
f"{issue_identifier or 'Unknown'} (missing data)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Format the issue first to get well-structured data
|
||||||
|
formatted_issue = linear_client.format_issue(issue)
|
||||||
|
|
||||||
|
# Convert issue to markdown format
|
||||||
|
issue_content = linear_client.format_issue_to_markdown(formatted_issue)
|
||||||
|
|
||||||
|
if not issue_content:
|
||||||
|
logger.warning(
|
||||||
|
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
skipped_issues.append(f"{issue_identifier} (no content)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create a short summary for the embedding
|
||||||
|
state = formatted_issue.get("state", "Unknown")
|
||||||
|
description = formatted_issue.get("description", "")
|
||||||
|
# Truncate description if it's too long for the summary
|
||||||
|
if description and len(description) > 500:
|
||||||
|
description = description[:497] + "..."
|
||||||
|
|
||||||
|
# Create a simple summary from the issue data
|
||||||
|
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
|
||||||
|
if description:
|
||||||
|
summary_content += f"Description: {description}\n\n"
|
||||||
|
|
||||||
|
# Add comment count
|
||||||
|
comment_count = len(formatted_issue.get("comments", []))
|
||||||
|
summary_content += f"Comments: {comment_count}"
|
||||||
|
|
||||||
|
content_hash = generate_content_hash(issue_content, search_space_id)
|
||||||
|
|
||||||
|
# Check if document with this content hash already exists
|
||||||
|
existing_document_by_hash = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document_by_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate embedding for the summary
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks - using the full issue content with comments
|
||||||
|
chunks = await create_document_chunks(issue_content)
|
||||||
|
|
||||||
|
# Create and store new document
|
||||||
|
logger.info(
|
||||||
|
f"Creating new document for issue {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=f"Linear - {issue_identifier}: {issue_title}",
|
||||||
|
document_type=DocumentType.LINEAR_CONNECTOR,
|
||||||
|
document_metadata={
|
||||||
|
"issue_id": issue_id,
|
||||||
|
"issue_identifier": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"state": state,
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
content_hash=content_hash,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
skipped_issues.append(
|
||||||
|
f"{issue.get('identifier', 'Unknown')} (processing error)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this issue and continue with others
|
||||||
|
|
||||||
|
# Update the last_indexed_at timestamp for the connector only if requested
|
||||||
|
total_processed = documents_indexed
|
||||||
|
if update_last_indexed:
|
||||||
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
|
# Commit all changes
|
||||||
|
await session.commit()
|
||||||
|
logger.info("Successfully committed all Linear document changes to database")
|
||||||
|
|
||||||
|
# Log success
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Successfully completed Linear indexing for connector {connector_id}",
|
||||||
|
{
|
||||||
|
"issues_processed": total_processed,
|
||||||
|
"documents_indexed": documents_indexed,
|
||||||
|
"documents_skipped": documents_skipped,
|
||||||
|
"skipped_issues_count": len(skipped_issues),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
total_processed,
|
||||||
|
None,
|
||||||
|
) # Return None as the error message to indicate success
|
||||||
|
|
||||||
|
except SQLAlchemyError as db_error:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Database error during Linear indexing for connector {connector_id}",
|
||||||
|
str(db_error),
|
||||||
|
{"error_type": "SQLAlchemyError"},
|
||||||
|
)
|
||||||
|
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||||
|
return 0, f"Database error: {db_error!s}"
|
||||||
|
except Exception as e:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to index Linear issues for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": type(e).__name__},
|
||||||
|
)
|
||||||
|
logger.error(f"Failed to index Linear issues: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Failed to index Linear issues: {e!s}"
|
surfsense_backend/app/tasks/connector_indexers/notion_indexer.py (new file, 406 lines)
@@ -0,0 +1,406 @@
"""
|
||||||
|
Notion connector indexer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.connectors.notion_history import NotionHistoryConnector
|
||||||
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
|
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||||
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
from app.utils.document_converters import generate_content_hash
|
||||||
|
|
||||||
|
from .base import (
|
||||||
|
build_document_metadata_string,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
|
create_document_chunks,
|
||||||
|
get_connector_by_id,
|
||||||
|
logger,
|
||||||
|
update_connector_last_indexed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def index_notion_pages(
|
||||||
|
session: AsyncSession,
|
||||||
|
connector_id: int,
|
||||||
|
search_space_id: int,
|
||||||
|
user_id: str,
|
||||||
|
start_date: str | None = None,
|
||||||
|
end_date: str | None = None,
|
||||||
|
update_last_indexed: bool = True,
|
||||||
|
) -> tuple[int, str | None]:
|
||||||
|
"""
|
||||||
|
Index Notion pages from all accessible pages.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
connector_id: ID of the Notion connector
|
||||||
|
search_space_id: ID of the search space to store documents in
|
||||||
|
user_id: ID of the user
|
||||||
|
start_date: Start date for indexing (YYYY-MM-DD format)
|
||||||
|
end_date: End date for indexing (YYYY-MM-DD format)
|
||||||
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple containing (number of documents indexed, error message or None)
|
||||||
|
"""
|
||||||
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
|
# Log task start
|
||||||
|
log_entry = await task_logger.log_task_start(
|
||||||
|
task_name="notion_pages_indexing",
|
||||||
|
source="connector_indexing_task",
|
||||||
|
message=f"Starting Notion pages indexing for connector {connector_id}",
|
||||||
|
metadata={
|
||||||
|
"connector_id": connector_id,
|
||||||
|
"user_id": str(user_id),
|
||||||
|
"start_date": start_date,
|
||||||
|
"end_date": end_date,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the connector
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Retrieving Notion connector {connector_id} from database",
|
||||||
|
{"stage": "connector_retrieval"},
|
||||||
|
)
|
||||||
|
|
||||||
|
connector = await get_connector_by_id(
|
||||||
|
session, connector_id, SearchSourceConnectorType.NOTION_CONNECTOR
|
||||||
|
)
|
||||||
|
|
||||||
|
if not connector:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Notion connector",
|
||||||
|
"Connector not found",
|
||||||
|
{"error_type": "ConnectorNotFound"},
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
0,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Notion connector",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the Notion token from the connector config
|
||||||
|
notion_token = connector.config.get("NOTION_INTEGRATION_TOKEN")
|
||||||
|
if not notion_token:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Notion integration token not found in connector config for connector {connector_id}",
|
||||||
|
"Missing Notion token",
|
||||||
|
{"error_type": "MissingToken"},
|
||||||
|
)
|
||||||
|
return 0, "Notion integration token not found in connector config"
|
||||||
|
|
||||||
|
# Initialize Notion client
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Initializing Notion client for connector {connector_id}",
|
||||||
|
{"stage": "client_initialization"},
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Initializing Notion client for connector {connector_id}")
|
||||||
|
notion_client = NotionHistoryConnector(token=notion_token)
|
||||||
|
|
||||||
|
# Calculate date range
|
||||||
|
if start_date is None or end_date is None:
|
||||||
|
# Fall back to calculating dates
|
||||||
|
calculated_end_date = datetime.now()
|
||||||
|
calculated_start_date = calculated_end_date - timedelta(
|
||||||
|
days=365
|
||||||
|
) # Check for last 1 year of pages
|
||||||
|
|
||||||
|
# Use calculated dates if not provided
|
||||||
|
if start_date is None:
|
||||||
|
start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
else:
|
||||||
|
# Convert YYYY-MM-DD to ISO format
|
||||||
|
start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%SZ"
|
||||||
|
)
|
||||||
|
|
||||||
|
if end_date is None:
|
||||||
|
end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
else:
|
||||||
|
# Convert YYYY-MM-DD to ISO format
|
||||||
|
end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%SZ"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Convert provided dates to ISO format for Notion API
|
||||||
|
start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%SZ"
|
||||||
|
)
|
||||||
|
end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%SZ"
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}")
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Fetching Notion pages from {start_date_iso} to {end_date_iso}",
|
||||||
|
{
|
||||||
|
"stage": "fetch_pages",
|
||||||
|
"start_date": start_date_iso,
|
||||||
|
"end_date": end_date_iso,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get all pages
|
||||||
|
try:
|
||||||
|
pages = notion_client.get_all_pages(
|
||||||
|
start_date=start_date_iso, end_date=end_date_iso
|
||||||
|
)
|
||||||
|
logger.info(f"Found {len(pages)} Notion pages")
|
||||||
|
except Exception as e:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to get Notion pages for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": "PageFetchError"},
|
||||||
|
)
|
||||||
|
logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Failed to get Notion pages: {e!s}"
|
||||||
|
|
||||||
|
if not pages:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"No Notion pages found for connector {connector_id}",
|
||||||
|
{"pages_found": 0},
|
||||||
|
)
|
||||||
|
logger.info("No Notion pages found to index")
|
||||||
|
return 0, "No Notion pages found"
|
||||||
|
|
||||||
|
# Track the number of documents indexed
|
||||||
|
documents_indexed = 0
|
||||||
|
documents_skipped = 0
|
||||||
|
skipped_pages = []
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Starting to process {len(pages)} Notion pages",
|
||||||
|
{"stage": "process_pages", "total_pages": len(pages)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process each page
|
||||||
|
for page in pages:
|
||||||
|
try:
|
||||||
|
page_id = page.get("page_id")
|
||||||
|
page_title = page.get("title", f"Untitled page ({page_id})")
|
||||||
|
page_content = page.get("content", [])
|
||||||
|
|
||||||
|
logger.info(f"Processing Notion page: {page_title} ({page_id})")
|
||||||
|
|
||||||
|
if not page_content:
|
||||||
|
logger.info(f"No content found in page {page_title}. Skipping.")
|
||||||
|
skipped_pages.append(f"{page_title} (no content)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Convert page content to markdown format
|
||||||
|
markdown_content = f"# Notion Page: {page_title}\n\n"
|
||||||
|
|
||||||
|
# Process blocks recursively
|
||||||
|
def process_blocks(blocks, level=0):
|
||||||
|
result = ""
|
||||||
|
for block in blocks:
|
||||||
|
block_type = block.get("type")
|
||||||
|
block_content = block.get("content", "")
|
||||||
|
children = block.get("children", [])
|
||||||
|
|
||||||
|
# Add indentation based on level
|
||||||
|
indent = " " * level
|
||||||
|
|
||||||
|
# Format based on block type
|
||||||
|
if block_type in ["paragraph", "text"]:
|
||||||
|
result += f"{indent}{block_content}\n\n"
|
||||||
|
elif block_type in ["heading_1", "header"]:
|
||||||
|
result += f"{indent}# {block_content}\n\n"
|
||||||
|
elif block_type == "heading_2":
|
||||||
|
result += f"{indent}## {block_content}\n\n"
|
||||||
|
elif block_type == "heading_3":
|
||||||
|
result += f"{indent}### {block_content}\n\n"
|
||||||
|
elif block_type == "bulleted_list_item":
|
||||||
|
result += f"{indent}* {block_content}\n"
|
||||||
|
elif block_type == "numbered_list_item":
|
||||||
|
result += f"{indent}1. {block_content}\n"
|
||||||
|
elif block_type == "to_do":
|
||||||
|
result += f"{indent}- [ ] {block_content}\n"
|
||||||
|
elif block_type == "toggle":
|
||||||
|
result += f"{indent}> {block_content}\n"
|
||||||
|
elif block_type == "code":
|
||||||
|
result += f"{indent}```\n{block_content}\n```\n\n"
|
||||||
|
elif block_type == "quote":
|
||||||
|
result += f"{indent}> {block_content}\n\n"
|
||||||
|
elif block_type == "callout":
|
||||||
|
result += f"{indent}> **Note:** {block_content}\n\n"
|
||||||
|
elif block_type == "image":
|
||||||
|
result += f"{indent}\n\n"
|
||||||
|
else:
|
||||||
|
# Default for other block types
|
||||||
|
if block_content:
|
||||||
|
result += f"{indent}{block_content}\n\n"
|
||||||
|
|
||||||
|
# Process children recursively
|
||||||
|
if children:
|
||||||
|
result += process_blocks(children, level + 1)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"Converting {len(page_content)} blocks to markdown for page {page_title}"
|
||||||
|
)
|
||||||
|
markdown_content += process_blocks(page_content)
|
||||||
|
|
||||||
|
# Format document metadata
|
||||||
|
metadata_sections = [
|
||||||
|
("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]),
|
||||||
|
(
|
||||||
|
"CONTENT",
|
||||||
|
[
|
||||||
|
"FORMAT: markdown",
|
||||||
|
"TEXT_START",
|
||||||
|
markdown_content,
|
||||||
|
"TEXT_END",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Build the document string
|
||||||
|
combined_document_string = build_document_metadata_string(
|
||||||
|
metadata_sections
|
||||||
|
)
|
||||||
|
content_hash = generate_content_hash(
|
||||||
|
combined_document_string, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if document with this content hash already exists
|
||||||
|
existing_document_by_hash = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document_by_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get user's long context LLM
|
||||||
|
user_llm = await get_user_long_context_llm(session, user_id)
|
||||||
|
if not user_llm:
|
||||||
|
logger.error(f"No long context LLM configured for user {user_id}")
|
||||||
|
skipped_pages.append(f"{page_title} (no LLM configured)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate summary
|
||||||
|
logger.debug(f"Generating summary for page {page_title}")
|
||||||
|
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
|
||||||
|
summary_result = await summary_chain.ainvoke(
|
||||||
|
{"document": combined_document_string}
|
||||||
|
)
|
||||||
|
summary_content = summary_result.content
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
logger.debug(f"Chunking content for page {page_title}")
|
||||||
|
chunks = await create_document_chunks(markdown_content)
|
||||||
|
|
||||||
|
# Create and store new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=f"Notion - {page_title}",
|
||||||
|
document_type=DocumentType.NOTION_CONNECTOR,
|
||||||
|
document_metadata={
|
||||||
|
"page_title": page_title,
|
||||||
|
"page_id": page_id,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
content_hash=content_hash,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(f"Successfully indexed new Notion page: {page_title}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
skipped_pages.append(
|
||||||
|
f"{page.get('title', 'Unknown')} (processing error)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this page and continue with others
|
||||||
|
|
||||||
|
# Update the last_indexed_at timestamp for the connector only if requested
|
||||||
|
# and if we successfully indexed at least one page
|
||||||
|
total_processed = documents_indexed
|
||||||
|
if total_processed > 0:
|
||||||
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
|
# Commit all changes
|
||||||
|
await session.commit()
|
||||||
|
|
||||||
|
# Prepare result message
|
||||||
|
result_message = None
|
||||||
|
if skipped_pages:
|
||||||
|
result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
|
||||||
|
else:
|
||||||
|
result_message = f"Processed {total_processed} pages."
|
||||||
|
|
||||||
|
# Log success
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Successfully completed Notion indexing for connector {connector_id}",
|
||||||
|
{
|
||||||
|
"pages_processed": total_processed,
|
||||||
|
"documents_indexed": documents_indexed,
|
||||||
|
"documents_skipped": documents_skipped,
|
||||||
|
"skipped_pages_count": len(skipped_pages),
|
||||||
|
"result_message": result_message,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
|
||||||
|
)
|
||||||
|
return total_processed, result_message
|
||||||
|
|
||||||
|
except SQLAlchemyError as db_error:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Database error during Notion indexing for connector {connector_id}",
|
||||||
|
str(db_error),
|
||||||
|
{"error_type": "SQLAlchemyError"},
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
f"Database error during Notion indexing: {db_error!s}", exc_info=True
|
||||||
|
)
|
||||||
|
return 0, f"Database error: {db_error!s}"
|
||||||
|
except Exception as e:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to index Notion pages for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": type(e).__name__},
|
||||||
|
)
|
||||||
|
logger.error(f"Failed to index Notion pages: {e!s}", exc_info=True)
|
||||||
|
return 0, f"Failed to index Notion pages: {e!s}"
|
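The Notion indexer's nested `process_blocks` helper walks block dictionaries of the shape `{"type", "content", "children"}` and emits Markdown. A minimal standalone sketch of that conversion, applied to an invented two-block sample, is shown below; only a few block types are handled to keep it short.

```python
# Hedged sketch of the block-to-markdown walk; the sample data is invented for illustration.
def blocks_to_markdown(blocks: list[dict], level: int = 0) -> str:
    indent = "  " * level
    out = ""
    for block in blocks:
        text = block.get("content", "")
        kind = block.get("type")
        if kind in ("heading_1", "header"):
            out += f"{indent}# {text}\n\n"
        elif kind == "bulleted_list_item":
            out += f"{indent}* {text}\n"
        else:  # paragraphs and unhandled types fall back to plain text
            out += f"{indent}{text}\n\n"
        # Recurse into nested children with one extra indentation level
        out += blocks_to_markdown(block.get("children", []), level + 1)
    return out


sample = [
    {"type": "heading_1", "content": "Roadmap", "children": []},
    {
        "type": "bulleted_list_item",
        "content": "Ship indexer",
        "children": [{"type": "paragraph", "content": "Target Q3", "children": []}],
    },
]
print(blocks_to_markdown(sample))
```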
surfsense_backend/app/tasks/connector_indexers/slack_indexer.py (new file, 396 lines)
@@ -0,0 +1,396 @@
"""
|
||||||
|
Slack connector indexer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from slack_sdk.errors import SlackApiError
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.connectors.slack_history import SlackHistory
|
||||||
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
|
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||||
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
from app.utils.document_converters import generate_content_hash
|
||||||
|
|
||||||
|
from .base import (
|
||||||
|
build_document_metadata_string,
|
||||||
|
calculate_date_range,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
|
create_document_chunks,
|
||||||
|
get_connector_by_id,
|
||||||
|
logger,
|
||||||
|
update_connector_last_indexed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def index_slack_messages(
|
||||||
|
session: AsyncSession,
|
||||||
|
connector_id: int,
|
||||||
|
search_space_id: int,
|
||||||
|
user_id: str,
|
||||||
|
start_date: str | None = None,
|
||||||
|
end_date: str | None = None,
|
||||||
|
update_last_indexed: bool = True,
|
||||||
|
) -> tuple[int, str | None]:
|
||||||
|
"""
|
||||||
|
Index Slack messages from all accessible channels.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
connector_id: ID of the Slack connector
|
||||||
|
search_space_id: ID of the search space to store documents in
|
||||||
|
user_id: ID of the user
|
||||||
|
start_date: Start date for indexing (YYYY-MM-DD format)
|
||||||
|
end_date: End date for indexing (YYYY-MM-DD format)
|
||||||
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple containing (number of documents indexed, error message or None)
|
||||||
|
"""
|
||||||
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
|
# Log task start
|
||||||
|
log_entry = await task_logger.log_task_start(
|
||||||
|
task_name="slack_messages_indexing",
|
||||||
|
source="connector_indexing_task",
|
||||||
|
message=f"Starting Slack messages indexing for connector {connector_id}",
|
||||||
|
metadata={
|
||||||
|
"connector_id": connector_id,
|
||||||
|
"user_id": str(user_id),
|
||||||
|
"start_date": start_date,
|
||||||
|
"end_date": end_date,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the connector
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Retrieving Slack connector {connector_id} from database",
|
||||||
|
{"stage": "connector_retrieval"},
|
||||||
|
)
|
||||||
|
|
||||||
|
connector = await get_connector_by_id(
|
||||||
|
session, connector_id, SearchSourceConnectorType.SLACK_CONNECTOR
|
||||||
|
)
|
||||||
|
|
||||||
|
if not connector:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Slack connector",
|
||||||
|
"Connector not found",
|
||||||
|
{"error_type": "ConnectorNotFound"},
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
0,
|
||||||
|
f"Connector with ID {connector_id} not found or is not a Slack connector",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the Slack token from the connector config
|
||||||
|
slack_token = connector.config.get("SLACK_BOT_TOKEN")
|
||||||
|
if not slack_token:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Slack token not found in connector config for connector {connector_id}",
|
||||||
|
"Missing Slack token",
|
||||||
|
{"error_type": "MissingToken"},
|
||||||
|
)
|
||||||
|
return 0, "Slack token not found in connector config"
|
||||||
|
|
||||||
|
# Initialize Slack client
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Initializing Slack client for connector {connector_id}",
|
||||||
|
{"stage": "client_initialization"},
|
||||||
|
)
|
||||||
|
|
||||||
|
slack_client = SlackHistory(token=slack_token)
|
||||||
|
|
||||||
|
# Calculate date range
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
"Calculating date range for Slack indexing",
|
||||||
|
{
|
||||||
|
"stage": "date_calculation",
|
||||||
|
"provided_start_date": start_date,
|
||||||
|
"provided_end_date": end_date,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
start_date_str, end_date_str = calculate_date_range(
|
||||||
|
connector, start_date, end_date, default_days_back=365
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}")
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Fetching Slack channels from {start_date_str} to {end_date_str}",
|
||||||
|
{
|
||||||
|
"stage": "fetch_channels",
|
||||||
|
"start_date": start_date_str,
|
||||||
|
"end_date": end_date_str,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get all channels
|
||||||
|
try:
|
||||||
|
channels = slack_client.get_all_channels()
|
||||||
|
except Exception as e:
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to get Slack channels for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": "ChannelFetchError"},
|
||||||
|
)
|
||||||
|
return 0, f"Failed to get Slack channels: {e!s}"
|
||||||
|
|
||||||
|
if not channels:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"No Slack channels found for connector {connector_id}",
|
||||||
|
{"channels_found": 0},
|
||||||
|
)
|
||||||
|
return 0, "No Slack channels found"
|
||||||
|
|
||||||
|
# Track the number of documents indexed
|
||||||
|
documents_indexed = 0
|
||||||
|
documents_skipped = 0
|
||||||
|
skipped_channels = []
|
||||||
|
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Starting to process {len(channels)} Slack channels",
|
||||||
|
{"stage": "process_channels", "total_channels": len(channels)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process each channel
|
||||||
|
for channel_obj in channels:
|
||||||
|
channel_id = channel_obj["id"]
|
||||||
|
channel_name = channel_obj["name"]
|
||||||
|
is_private = channel_obj["is_private"]
|
||||||
|
is_member = channel_obj["is_member"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
# If it's a private channel and the bot is not a member, skip.
|
||||||
|
if is_private and not is_member:
|
||||||
|
logger.warning(
|
||||||
|
f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping."
|
||||||
|
)
|
||||||
|
skipped_channels.append(
|
||||||
|
f"{channel_name} (private, bot not a member)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get messages for this channel
|
||||||
|
messages, error = slack_client.get_history_by_date_range(
|
||||||
|
channel_id=channel_id,
|
||||||
|
start_date=start_date_str,
|
||||||
|
end_date=end_date_str,
|
||||||
|
limit=1000, # Limit to 1000 messages per channel
|
||||||
|
)
|
||||||
|
|
||||||
|
if error:
|
||||||
|
logger.warning(
|
||||||
|
f"Error getting messages from channel {channel_name}: {error}"
|
||||||
|
)
|
||||||
|
skipped_channels.append(f"{channel_name} (error: {error})")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this channel if there's an error
|
||||||
|
|
||||||
|
if not messages:
|
||||||
|
logger.info(
|
||||||
|
f"No messages found in channel {channel_name} for the specified date range."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip if no messages
|
||||||
|
|
||||||
|
# Format messages with user info
|
||||||
|
formatted_messages = []
|
||||||
|
for msg in messages:
|
||||||
|
# Skip bot messages and system messages
|
||||||
|
if msg.get("subtype") in [
|
||||||
|
"bot_message",
|
||||||
|
"channel_join",
|
||||||
|
"channel_leave",
|
||||||
|
]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
formatted_msg = slack_client.format_message(
|
||||||
|
msg, include_user_info=True
|
||||||
|
)
|
||||||
|
formatted_messages.append(formatted_msg)
|
||||||
|
|
||||||
|
if not formatted_messages:
|
||||||
|
logger.info(
|
||||||
|
f"No valid messages found in channel {channel_name} after filtering."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip if no valid messages after filtering
|
||||||
|
|
||||||
|
# Convert messages to markdown format
|
||||||
|
channel_content = f"# Slack Channel: {channel_name}\n\n"
|
||||||
|
|
||||||
|
for msg in formatted_messages:
|
||||||
|
user_name = msg.get("user_name", "Unknown User")
|
||||||
|
timestamp = msg.get("datetime", "Unknown Time")
|
||||||
|
text = msg.get("text", "")
|
||||||
|
|
||||||
|
channel_content += (
|
||||||
|
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Format document metadata
|
||||||
|
metadata_sections = [
|
||||||
|
(
|
||||||
|
"METADATA",
|
||||||
|
[
|
||||||
|
f"CHANNEL_NAME: {channel_name}",
|
||||||
|
f"CHANNEL_ID: {channel_id}",
|
||||||
|
f"MESSAGE_COUNT: {len(formatted_messages)}",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"CONTENT",
|
||||||
|
["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Build the document string
|
||||||
|
combined_document_string = build_document_metadata_string(
|
||||||
|
metadata_sections
|
||||||
|
)
|
||||||
|
content_hash = generate_content_hash(
|
||||||
|
combined_document_string, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if document with this content hash already exists
|
||||||
|
existing_document_by_hash = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document_by_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get user's long context LLM
|
||||||
|
user_llm = await get_user_long_context_llm(session, user_id)
|
||||||
|
if not user_llm:
|
||||||
|
logger.error(f"No long context LLM configured for user {user_id}")
|
||||||
|
skipped_channels.append(f"{channel_name} (no LLM configured)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate summary
|
||||||
|
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
|
||||||
|
summary_result = await summary_chain.ainvoke(
|
||||||
|
{"document": combined_document_string}
|
||||||
|
)
|
||||||
|
summary_content = summary_result.content
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(channel_content)
|
||||||
|
|
||||||
|
# Create and store new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=f"Slack - {channel_name}",
|
||||||
|
document_type=DocumentType.SLACK_CONNECTOR,
|
||||||
|
document_metadata={
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_id": channel_id,
|
||||||
|
"start_date": start_date_str,
|
||||||
|
"end_date": end_date_str,
|
||||||
|
"message_count": len(formatted_messages),
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
|
||||||
|
)
|
||||||
|
|
||||||
|
except SlackApiError as slack_error:
|
||||||
|
logger.error(
|
||||||
|
f"Slack API error for channel {channel_name}: {slack_error!s}"
|
||||||
|
)
|
||||||
|
skipped_channels.append(f"{channel_name} (Slack API error)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this channel and continue with others
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing channel {channel_name}: {e!s}")
|
||||||
|
skipped_channels.append(f"{channel_name} (processing error)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue # Skip this channel and continue with others
|
||||||
|
|
||||||
|
# Update the last_indexed_at timestamp for the connector only if requested
|
||||||
|
# and if we successfully indexed at least one channel
|
||||||
|
total_processed = documents_indexed
|
||||||
|
if total_processed > 0:
|
||||||
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
|
# Commit all changes
|
||||||
|
await session.commit()
|
||||||
|
|
||||||
|
# Prepare result message
|
||||||
|
result_message = None
|
||||||
|
if skipped_channels:
|
||||||
|
result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
|
||||||
|
else:
|
||||||
|
result_message = f"Processed {total_processed} channels."
|
||||||
|
|
||||||
|
# Log success
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Successfully completed Slack indexing for connector {connector_id}",
|
||||||
|
{
|
||||||
|
"channels_processed": total_processed,
|
||||||
|
"documents_indexed": documents_indexed,
|
||||||
|
"documents_skipped": documents_skipped,
|
||||||
|
"skipped_channels_count": len(skipped_channels),
|
||||||
|
"result_message": result_message,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
|
||||||
|
)
|
||||||
|
return total_processed, result_message
|
||||||
|
|
||||||
|
except SQLAlchemyError as db_error:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Database error during Slack indexing for connector {connector_id}",
|
||||||
|
str(db_error),
|
||||||
|
{"error_type": "SQLAlchemyError"},
|
||||||
|
)
|
||||||
|
logger.error(f"Database error: {db_error!s}")
|
||||||
|
return 0, f"Database error: {db_error!s}"
|
||||||
|
except Exception as e:
|
||||||
|
await session.rollback()
|
||||||
|
await task_logger.log_task_failure(
|
||||||
|
log_entry,
|
||||||
|
f"Failed to index Slack messages for connector {connector_id}",
|
||||||
|
str(e),
|
||||||
|
{"error_type": type(e).__name__},
|
||||||
|
)
|
||||||
|
logger.error(f"Failed to index Slack messages: {e!s}")
|
||||||
|
return 0, f"Failed to index Slack messages: {e!s}"
|
(File diff suppressed because it is too large)
surfsense_backend/app/tasks/document_processors/__init__.py (new file, 47 lines)
@@ -0,0 +1,47 @@
"""
|
||||||
|
Document processors module for background tasks.
|
||||||
|
|
||||||
|
This module provides a collection of document processors for different content types
|
||||||
|
and sources. Each processor is responsible for handling a specific type of document
|
||||||
|
processing task in the background.
|
||||||
|
|
||||||
|
Available processors:
|
||||||
|
- URL crawler: Process web pages from URLs
|
||||||
|
- Extension processor: Handle documents from browser extension
|
||||||
|
- Markdown processor: Process markdown files
|
||||||
|
- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
|
||||||
|
- YouTube processor: Process YouTube videos and extract transcripts
|
||||||
|
"""
|
||||||
|
|
||||||
|
# URL crawler
|
||||||
|
# Extension processor
|
||||||
|
from .extension_processor import add_extension_received_document
|
||||||
|
|
||||||
|
# File processors
|
||||||
|
from .file_processors import (
|
||||||
|
add_received_file_document_using_docling,
|
||||||
|
add_received_file_document_using_llamacloud,
|
||||||
|
add_received_file_document_using_unstructured,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Markdown processor
|
||||||
|
from .markdown_processor import add_received_markdown_file_document
|
||||||
|
from .url_crawler import add_crawled_url_document
|
||||||
|
|
||||||
|
# YouTube processor
|
||||||
|
from .youtube_processor import add_youtube_video_document
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# URL processing
|
||||||
|
"add_crawled_url_document",
|
||||||
|
# Extension processing
|
||||||
|
"add_extension_received_document",
|
||||||
|
"add_received_file_document_using_docling",
|
||||||
|
"add_received_file_document_using_llamacloud",
|
||||||
|
# File processing with different ETL services
|
||||||
|
"add_received_file_document_using_unstructured",
|
||||||
|
# Markdown file processing
|
||||||
|
"add_received_markdown_file_document",
|
||||||
|
# YouTube video processing
|
||||||
|
"add_youtube_video_document",
|
||||||
|
]
|
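With these re-exports in place, callers can import processors from the package root rather than from individual modules. A minimal sketch, using only the signature shown in `extension_processor.py` further below:

```python
# Hedged usage sketch; the surrounding session/content objects are assumed to exist.
from app.tasks.document_processors import add_extension_received_document

# Inside an async background task, mirroring the signature in extension_processor.py:
# document = await add_extension_received_document(session, content, search_space_id, user_id)
```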
surfsense_backend/app/tasks/document_processors/base.py (new file, 75 lines)
@@ -0,0 +1,75 @@
"""
|
||||||
|
Base functionality and shared imports for document processors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from langchain_community.document_transformers import MarkdownifyTransformer
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.future import select
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.db import Chunk, Document
|
||||||
|
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||||
|
|
||||||
|
# Initialize markdown transformer
|
||||||
|
md = MarkdownifyTransformer()
|
||||||
|
|
||||||
|
|
||||||
|
async def check_duplicate_document(
|
||||||
|
session: AsyncSession, content_hash: str
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Check if a document with the given content hash already exists.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
content_hash: Hash of the document content
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Existing document if found, None otherwise
|
||||||
|
"""
|
||||||
|
existing_doc_result = await session.execute(
|
||||||
|
select(Document).where(Document.content_hash == content_hash)
|
||||||
|
)
|
||||||
|
return existing_doc_result.scalars().first()
|
||||||
|
|
||||||
|
|
||||||
|
async def create_document_chunks(content: str) -> list[Chunk]:
|
||||||
|
"""
|
||||||
|
Create chunks from document content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: Document content to chunk
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Chunk objects with embeddings
|
||||||
|
"""
|
||||||
|
return [
|
||||||
|
Chunk(
|
||||||
|
content=chunk.text,
|
||||||
|
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||||
|
)
|
||||||
|
for chunk in config.chunker_instance.chunk(content)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_document_summary(
|
||||||
|
content: str, user_llm, document_title: str = ""
|
||||||
|
) -> tuple[str, list[float]]:
|
||||||
|
"""
|
||||||
|
Generate summary and embedding for document content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: Document content
|
||||||
|
user_llm: User's LLM instance
|
||||||
|
document_title: Optional document title for context
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (summary_content, summary_embedding)
|
||||||
|
"""
|
||||||
|
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
|
||||||
|
summary_result = await summary_chain.ainvoke({"document": content})
|
||||||
|
summary_content = summary_result.content
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(summary_content)
|
||||||
|
|
||||||
|
return summary_content, summary_embedding
|
|
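The three helpers above encode the pipeline every concrete processor in this commit repeats: deduplicate by content hash, summarize and embed the summary, then chunk and embed the raw text. A condensed sketch of that flow (illustrative only; the helper name, the generic title/doc_type arguments, and the empty metadata dict are placeholders):

from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document


async def store_markdown_as_document(
    session: AsyncSession,
    markdown: str,
    content_hash: str,
    user_llm,
    title: str,
    doc_type,
    search_space_id: int,
) -> Document:
    # 1. Skip work if the same content hash is already stored.
    existing = await check_duplicate_document(session, content_hash)
    if existing:
        return existing

    # 2. Summarize the full text and embed the summary.
    summary_content, summary_embedding = await generate_document_summary(markdown, user_llm)

    # 3. Chunk and embed the raw text for retrieval.
    chunks = await create_document_chunks(markdown)

    document = Document(
        search_space_id=search_space_id,
        title=title,
        document_type=doc_type,
        document_metadata={},  # placeholder; real processors record source-specific metadata
        content=summary_content,
        embedding=summary_embedding,
        chunks=chunks,
        content_hash=content_hash,
    )
    session.add(document)
    await session.commit()
    await session.refresh(document)
    return document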
@@ -0,0 +1,163 @@
"""
Extension document processor for SurfSense browser extension.
"""

import logging

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.schemas import ExtensionDocumentContent
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)


async def add_extension_received_document(
    session: AsyncSession,
    content: ExtensionDocumentContent,
    search_space_id: int,
    user_id: str,
) -> Document | None:
    """
    Process and store document content received from the SurfSense Extension.

    Args:
        session: Database session
        content: Document content from extension
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="extension_document",
        source="background_task",
        message=f"Processing extension document: {content.metadata.VisitedWebPageTitle}",
        metadata={
            "url": content.metadata.VisitedWebPageURL,
            "title": content.metadata.VisitedWebPageTitle,
            "user_id": str(user_id),
        },
    )

    try:
        # Format document metadata in a more maintainable way
        metadata_sections = [
            (
                "METADATA",
                [
                    f"SESSION_ID: {content.metadata.BrowsingSessionId}",
                    f"URL: {content.metadata.VisitedWebPageURL}",
                    f"TITLE: {content.metadata.VisitedWebPageTitle}",
                    f"REFERRER: {content.metadata.VisitedWebPageReffererURL}",
                    f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}",
                    f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}",
                ],
            ),
            (
                "CONTENT",
                ["FORMAT: markdown", "TEXT_START", content.pageContent, "TEXT_END"],
            ),
        ]

        # Build the document string more efficiently
        document_parts = []
        document_parts.append("<DOCUMENT>")

        for section_title, section_content in metadata_sections:
            document_parts.append(f"<{section_title}>")
            document_parts.extend(section_content)
            document_parts.append(f"</{section_title}>")

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string, search_space_id)

        # Check if document with this content hash already exists
        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            await task_logger.log_task_success(
                log_entry,
                f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        summary_content, summary_embedding = await generate_document_summary(
            combined_document_string, user_llm
        )

        # Process chunks
        chunks = await create_document_chunks(content.pageContent)

        # Create and store document
        document = Document(
            search_space_id=search_space_id,
            title=content.metadata.VisitedWebPageTitle,
            document_type=DocumentType.EXTENSION,
            document_metadata=content.metadata.model_dump(),
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully processed extension document: {content.metadata.VisitedWebPageTitle}",
            {
                "document_id": document.id,
                "content_hash": content_hash,
                "url": content.metadata.VisitedWebPageURL,
            },
        )

        return document

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error processing extension document: {content.metadata.VisitedWebPageTitle}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        raise db_error
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to process extension document: {content.metadata.VisitedWebPageTitle}",
            str(e),
            {"error_type": type(e).__name__},
        )
        raise RuntimeError(f"Failed to process extension document: {e!s}") from e
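For reference, the metadata_sections loop above serializes an extension payload into one flat tagged string before hashing and summarization. With invented field values, the resulting combined_document_string looks like this:

<DOCUMENT>
<METADATA>
SESSION_ID: 3f9c1a7e
URL: https://example.com/article
TITLE: Example Article
REFERRER: https://example.com/
TIMESTAMP: 2025-01-01T12:00:00.000Z
DURATION_MS: 45210
</METADATA>
<CONTENT>
FORMAT: markdown
TEXT_START
# Example Article
...page content in markdown...
TEXT_END
</CONTENT>
</DOCUMENT>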
@@ -0,0 +1,261 @@
"""
File document processors for different ETL services (Unstructured, LlamaCloud, Docling).
"""

import logging

from langchain_core.documents import Document as LangChainDocument
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.utils.document_converters import (
    convert_document_to_markdown,
    generate_content_hash,
)

from .base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)


async def add_received_file_document_using_unstructured(
    session: AsyncSession,
    file_name: str,
    unstructured_processed_elements: list[LangChainDocument],
    search_space_id: int,
    user_id: str,
) -> Document | None:
    """
    Process and store a file document using Unstructured service.

    Args:
        session: Database session
        file_name: Name of the processed file
        unstructured_processed_elements: Processed elements from Unstructured
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    try:
        file_in_markdown = await convert_document_to_markdown(
            unstructured_processed_elements
        )

        content_hash = generate_content_hash(file_in_markdown, search_space_id)

        # Check if document with this content hash already exists
        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # TODO: Check if file_markdown exceeds token limit of embedding model

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        summary_content, summary_embedding = await generate_document_summary(
            file_in_markdown, user_llm
        )

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)

        # Create and store document
        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=DocumentType.FILE,
            document_metadata={
                "FILE_NAME": file_name,
                "ETL_SERVICE": "UNSTRUCTURED",
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        raise db_error
    except Exception as e:
        await session.rollback()
        raise RuntimeError(f"Failed to process file document: {e!s}") from e


async def add_received_file_document_using_llamacloud(
    session: AsyncSession,
    file_name: str,
    llamacloud_markdown_document: str,
    search_space_id: int,
    user_id: str,
) -> Document | None:
    """
    Process and store document content parsed by LlamaCloud.

    Args:
        session: Database session
        file_name: Name of the processed file
        llamacloud_markdown_document: Markdown content from LlamaCloud parsing
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    try:
        # Combine all markdown documents into one
        file_in_markdown = llamacloud_markdown_document

        content_hash = generate_content_hash(file_in_markdown, search_space_id)

        # Check if document with this content hash already exists
        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        summary_content, summary_embedding = await generate_document_summary(
            file_in_markdown, user_llm
        )

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)

        # Create and store document
        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=DocumentType.FILE,
            document_metadata={
                "FILE_NAME": file_name,
                "ETL_SERVICE": "LLAMACLOUD",
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        raise db_error
    except Exception as e:
        await session.rollback()
        raise RuntimeError(
            f"Failed to process file document using LlamaCloud: {e!s}"
        ) from e


async def add_received_file_document_using_docling(
    session: AsyncSession,
    file_name: str,
    docling_markdown_document: str,
    search_space_id: int,
    user_id: str,
) -> Document | None:
    """
    Process and store document content parsed by Docling.

    Args:
        session: Database session
        file_name: Name of the processed file
        docling_markdown_document: Markdown content from Docling parsing
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    try:
        file_in_markdown = docling_markdown_document

        content_hash = generate_content_hash(file_in_markdown, search_space_id)

        # Check if document with this content hash already exists
        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary using chunked processing for large documents
        from app.services.docling_service import create_docling_service

        docling_service = create_docling_service()

        summary_content = await docling_service.process_large_document_summary(
            content=file_in_markdown, llm=user_llm, document_title=file_name
        )

        from app.config import config

        summary_embedding = config.embedding_model_instance.embed(summary_content)

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)

        # Create and store document
        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=DocumentType.FILE,
            document_metadata={
                "FILE_NAME": file_name,
                "ETL_SERVICE": "DOCLING",
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        raise db_error
    except Exception as e:
        await session.rollback()
        raise RuntimeError(
            f"Failed to process file document using Docling: {e!s}"
        ) from e
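The three entry points above differ only in the payload they accept and the ETL_SERVICE tag they record. A hypothetical call-site sketch (not from this commit; the etl_service string and parsed_payload argument are assumed names) that routes a parsed file to the matching function:

from sqlalchemy.ext.asyncio import AsyncSession


async def store_parsed_file(
    session: AsyncSession,
    etl_service: str,
    file_name: str,
    parsed_payload,
    search_space_id: int,
    user_id: str,
):
    if etl_service == "UNSTRUCTURED":
        # Expects the list of LangChain documents produced by Unstructured.
        return await add_received_file_document_using_unstructured(
            session, file_name, parsed_payload, search_space_id, user_id
        )
    if etl_service == "LLAMACLOUD":
        # Expects the markdown string produced by LlamaCloud parsing.
        return await add_received_file_document_using_llamacloud(
            session, file_name, parsed_payload, search_space_id, user_id
        )
    if etl_service == "DOCLING":
        # Expects the markdown string produced by Docling.
        return await add_received_file_document_using_docling(
            session, file_name, parsed_payload, search_space_id, user_id
        )
    raise ValueError(f"Unknown ETL service: {etl_service}")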
@@ -0,0 +1,136 @@
"""
Markdown file document processor.
"""

import logging

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)


async def add_received_markdown_file_document(
    session: AsyncSession,
    file_name: str,
    file_in_markdown: str,
    search_space_id: int,
    user_id: str,
) -> Document | None:
    """
    Process and store a markdown file document.

    Args:
        session: Database session
        file_name: Name of the markdown file
        file_in_markdown: Content of the markdown file
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="markdown_file_document",
        source="background_task",
        message=f"Processing markdown file: {file_name}",
        metadata={
            "filename": file_name,
            "user_id": str(user_id),
            "content_length": len(file_in_markdown),
        },
    )

    try:
        content_hash = generate_content_hash(file_in_markdown, search_space_id)

        # Check if document with this content hash already exists
        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            await task_logger.log_task_success(
                log_entry,
                f"Markdown file document already exists: {file_name}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        summary_content, summary_embedding = await generate_document_summary(
            file_in_markdown, user_llm
        )

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)

        # Create and store document
        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=DocumentType.FILE,
            document_metadata={
                "FILE_NAME": file_name,
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully processed markdown file: {file_name}",
            {
                "document_id": document.id,
                "content_hash": content_hash,
                "chunks_count": len(chunks),
                "summary_length": len(summary_content),
            },
        )

        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error processing markdown file: {file_name}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        raise db_error
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to process markdown file: {file_name}",
            str(e),
            {"error_type": type(e).__name__},
        )
        raise RuntimeError(f"Failed to process file document: {e!s}") from e
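A minimal, illustrative call site (the path and the session argument are placeholders, not repository code) that reads a markdown file from disk and hands it to the processor above:

from pathlib import Path


async def ingest_markdown(session, search_space_id: int, user_id: str) -> None:
    path = Path("notes/meeting-notes.md")  # placeholder path
    await add_received_markdown_file_document(
        session,
        file_name=path.name,
        file_in_markdown=path.read_text(encoding="utf-8"),
        search_space_id=search_space_id,
        user_id=user_id,
    )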
242  surfsense_backend/app/tasks/document_processors/url_crawler.py  Normal file
@@ -0,0 +1,242 @@
"""
URL crawler document processor.
"""

import logging

import validators
from langchain_community.document_loaders import AsyncChromiumLoader, FireCrawlLoader
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
    md,
)


async def add_crawled_url_document(
    session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document | None:
    """
    Process and store a document from a crawled URL.

    Args:
        session: Database session
        url: URL to crawl
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Document object if successful, None if failed
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="crawl_url_document",
        source="background_task",
        message=f"Starting URL crawling process for: {url}",
        metadata={"url": url, "user_id": str(user_id)},
    )

    try:
        # URL validation step
        await task_logger.log_task_progress(
            log_entry, f"Validating URL: {url}", {"stage": "validation"}
        )

        if not validators.url(url):
            raise ValueError(f"Url {url} is not a valid URL address")

        # Set up crawler
        await task_logger.log_task_progress(
            log_entry,
            f"Setting up crawler for URL: {url}",
            {
                "stage": "crawler_setup",
                "firecrawl_available": bool(config.FIRECRAWL_API_KEY),
            },
        )

        if config.FIRECRAWL_API_KEY:
            crawl_loader = FireCrawlLoader(
                url=url,
                api_key=config.FIRECRAWL_API_KEY,
                mode="scrape",
                params={
                    "formats": ["markdown"],
                    "excludeTags": ["a"],
                },
            )
        else:
            crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)

        # Perform crawling
        await task_logger.log_task_progress(
            log_entry,
            f"Crawling URL content: {url}",
            {"stage": "crawling", "crawler_type": type(crawl_loader).__name__},
        )

        url_crawled = await crawl_loader.aload()

        if isinstance(crawl_loader, FireCrawlLoader):
            content_in_markdown = url_crawled[0].page_content
        elif isinstance(crawl_loader, AsyncChromiumLoader):
            content_in_markdown = md.transform_documents(url_crawled)[0].page_content

        # Format document
        await task_logger.log_task_progress(
            log_entry,
            f"Processing crawled content from: {url}",
            {"stage": "content_processing", "content_length": len(content_in_markdown)},
        )

        # Format document metadata in a more maintainable way
        metadata_sections = [
            (
                "METADATA",
                [
                    f"{key.upper()}: {value}"
                    for key, value in url_crawled[0].metadata.items()
                ],
            ),
            (
                "CONTENT",
                ["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"],
            ),
        ]

        # Build the document string more efficiently
        document_parts = []
        document_parts.append("<DOCUMENT>")

        for section_title, section_content in metadata_sections:
            document_parts.append(f"<{section_title}>")
            document_parts.extend(section_content)
            document_parts.append(f"</{section_title}>")

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string, search_space_id)

        # Check for duplicates
        await task_logger.log_task_progress(
            log_entry,
            f"Checking for duplicate content: {url}",
            {"stage": "duplicate_check", "content_hash": content_hash},
        )

        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            await task_logger.log_task_success(
                log_entry,
                f"Document already exists for URL: {url}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get LLM for summary generation
        await task_logger.log_task_progress(
            log_entry,
            f"Preparing for summary generation: {url}",
            {"stage": "llm_setup"},
        )

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        await task_logger.log_task_progress(
            log_entry,
            f"Generating summary for URL content: {url}",
            {"stage": "summary_generation"},
        )

        summary_content, summary_embedding = await generate_document_summary(
            combined_document_string, user_llm
        )

        # Process chunks
        await task_logger.log_task_progress(
            log_entry,
            f"Processing content chunks for URL: {url}",
            {"stage": "chunk_processing"},
        )

        chunks = await create_document_chunks(content_in_markdown)

        # Create and store document
        await task_logger.log_task_progress(
            log_entry,
            f"Creating document in database for URL: {url}",
            {"stage": "document_creation", "chunks_count": len(chunks)},
        )

        document = Document(
            search_space_id=search_space_id,
            title=url_crawled[0].metadata["title"]
            if isinstance(crawl_loader, FireCrawlLoader)
            else url_crawled[0].metadata["source"],
            document_type=DocumentType.CRAWLED_URL,
            document_metadata=url_crawled[0].metadata,
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully crawled and processed URL: {url}",
            {
                "document_id": document.id,
                "title": document.title,
                "content_hash": content_hash,
                "chunks_count": len(chunks),
                "summary_length": len(summary_content),
            },
        )

        return document

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error while processing URL: {url}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        raise db_error
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to crawl URL: {url}",
            str(e),
            {"error_type": type(e).__name__},
        )
        raise RuntimeError(f"Failed to crawl URL: {e!s}") from e
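An illustrative driver (not part of the commit) that exercises the crawler; which loader actually runs depends on whether FIRECRAWL_API_KEY is set in config, exactly as in the branch above. The URLs, IDs, and session argument are placeholders:

async def crawl_examples(session, search_space_id: int, user_id: str) -> None:
    for url in ("https://example.com/docs", "https://example.org/blog/post"):
        # FireCrawl path if an API key is configured, headless Chromium otherwise.
        doc = await add_crawled_url_document(session, url, search_space_id, user_id)
        if doc is not None:
            print(doc.id, doc.title)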
@@ -0,0 +1,326 @@
"""
YouTube video document processor.
"""

import logging
from urllib.parse import parse_qs, urlparse

import aiohttp
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi

from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash

from .base import (
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)


def get_youtube_video_id(url: str) -> str | None:
    """
    Extract video ID from various YouTube URL formats.

    Args:
        url: YouTube URL

    Returns:
        Video ID if found, None otherwise
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname

    if hostname == "youtu.be":
        return parsed_url.path[1:]
    if hostname in ("www.youtube.com", "youtube.com"):
        if parsed_url.path == "/watch":
            query_params = parse_qs(parsed_url.query)
            return query_params.get("v", [None])[0]
        if parsed_url.path.startswith("/embed/"):
            return parsed_url.path.split("/")[2]
        if parsed_url.path.startswith("/v/"):
            return parsed_url.path.split("/")[2]
    return None


async def add_youtube_video_document(
    session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document:
    """
    Process a YouTube video URL, extract transcripts, and store as a document.

    Args:
        session: Database session for storing the document
        url: YouTube video URL (supports standard, shortened, and embed formats)
        search_space_id: ID of the search space to add the document to
        user_id: ID of the user

    Returns:
        Document: The created document object

    Raises:
        ValueError: If the YouTube video ID cannot be extracted from the URL
        SQLAlchemyError: If there's a database error
        RuntimeError: If the video processing fails
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="youtube_video_document",
        source="background_task",
        message=f"Starting YouTube video processing for: {url}",
        metadata={"url": url, "user_id": str(user_id)},
    )

    try:
        # Extract video ID from URL
        await task_logger.log_task_progress(
            log_entry,
            f"Extracting video ID from URL: {url}",
            {"stage": "video_id_extraction"},
        )

        # Get video ID
        video_id = get_youtube_video_id(url)
        if not video_id:
            raise ValueError(f"Could not extract video ID from URL: {url}")

        await task_logger.log_task_progress(
            log_entry,
            f"Video ID extracted: {video_id}",
            {"stage": "video_id_extracted", "video_id": video_id},
        )

        # Get video metadata
        await task_logger.log_task_progress(
            log_entry,
            f"Fetching video metadata for: {video_id}",
            {"stage": "metadata_fetch"},
        )

        params = {
            "format": "json",
            "url": f"https://www.youtube.com/watch?v={video_id}",
        }
        oembed_url = "https://www.youtube.com/oembed"

        async with (
            aiohttp.ClientSession() as http_session,
            http_session.get(oembed_url, params=params) as response,
        ):
            video_data = await response.json()

        await task_logger.log_task_progress(
            log_entry,
            f"Video metadata fetched: {video_data.get('title', 'Unknown')}",
            {
                "stage": "metadata_fetched",
                "title": video_data.get("title"),
                "author": video_data.get("author_name"),
            },
        )

        # Get video transcript
        await task_logger.log_task_progress(
            log_entry,
            f"Fetching transcript for video: {video_id}",
            {"stage": "transcript_fetch"},
        )

        try:
            captions = YouTubeTranscriptApi.get_transcript(video_id)
            # Include complete caption information with timestamps
            transcript_segments = []
            for line in captions:
                start_time = line.get("start", 0)
                duration = line.get("duration", 0)
                text = line.get("text", "")
                timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]"
                transcript_segments.append(f"{timestamp} {text}")
            transcript_text = "\n".join(transcript_segments)

            await task_logger.log_task_progress(
                log_entry,
                f"Transcript fetched successfully: {len(captions)} segments",
                {
                    "stage": "transcript_fetched",
                    "segments_count": len(captions),
                    "transcript_length": len(transcript_text),
                },
            )
        except Exception as e:
            transcript_text = f"No captions available for this video. Error: {e!s}"
            await task_logger.log_task_progress(
                log_entry,
                f"No transcript available for video: {video_id}",
                {"stage": "transcript_unavailable", "error": str(e)},
            )

        # Format document
        await task_logger.log_task_progress(
            log_entry,
            f"Processing video content: {video_data.get('title', 'YouTube Video')}",
            {"stage": "content_processing"},
        )

        # Format document metadata in a more maintainable way
        metadata_sections = [
            (
                "METADATA",
                [
                    f"TITLE: {video_data.get('title', 'YouTube Video')}",
                    f"URL: {url}",
                    f"VIDEO_ID: {video_id}",
                    f"AUTHOR: {video_data.get('author_name', 'Unknown')}",
                    f"THUMBNAIL: {video_data.get('thumbnail_url', '')}",
                ],
            ),
            (
                "CONTENT",
                ["FORMAT: transcript", "TEXT_START", transcript_text, "TEXT_END"],
            ),
        ]

        # Build the document string more efficiently
        document_parts = []
        document_parts.append("<DOCUMENT>")

        for section_title, section_content in metadata_sections:
            document_parts.append(f"<{section_title}>")
            document_parts.extend(section_content)
            document_parts.append(f"</{section_title}>")

        document_parts.append("</DOCUMENT>")
        combined_document_string = "\n".join(document_parts)
        content_hash = generate_content_hash(combined_document_string, search_space_id)

        # Check for duplicates
        await task_logger.log_task_progress(
            log_entry,
            f"Checking for duplicate video content: {video_id}",
            {"stage": "duplicate_check", "content_hash": content_hash},
        )

        existing_document = await check_duplicate_document(session, content_hash)
        if existing_document:
            await task_logger.log_task_success(
                log_entry,
                f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}",
                {
                    "duplicate_detected": True,
                    "existing_document_id": existing_document.id,
                    "video_id": video_id,
                },
            )
            logging.info(
                f"Document with content hash {content_hash} already exists. Skipping processing."
            )
            return existing_document

        # Get LLM for summary generation
        await task_logger.log_task_progress(
            log_entry,
            f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
            {"stage": "llm_setup"},
        )

        # Get user's long context LLM
        user_llm = await get_user_long_context_llm(session, user_id)
        if not user_llm:
            raise RuntimeError(f"No long context LLM configured for user {user_id}")

        # Generate summary
        await task_logger.log_task_progress(
            log_entry,
            f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
            {"stage": "summary_generation"},
        )

        summary_content, summary_embedding = await generate_document_summary(
            combined_document_string, user_llm
        )

        # Process chunks
        await task_logger.log_task_progress(
            log_entry,
            f"Processing content chunks for video: {video_data.get('title', 'YouTube Video')}",
            {"stage": "chunk_processing"},
        )

        chunks = await create_document_chunks(combined_document_string)

        # Create document
        await task_logger.log_task_progress(
            log_entry,
            f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
            {"stage": "document_creation", "chunks_count": len(chunks)},
        )

        document = Document(
            title=video_data.get("title", "YouTube Video"),
            document_type=DocumentType.YOUTUBE_VIDEO,
            document_metadata={
                "url": url,
                "video_id": video_id,
                "video_title": video_data.get("title", "YouTube Video"),
                "author": video_data.get("author_name", "Unknown"),
                "thumbnail": video_data.get("thumbnail_url", ""),
            },
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            search_space_id=search_space_id,
            content_hash=content_hash,
        )

        session.add(document)
        await session.commit()
        await session.refresh(document)

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully processed YouTube video: {video_data.get('title', 'YouTube Video')}",
            {
                "document_id": document.id,
                "video_id": video_id,
                "title": document.title,
                "content_hash": content_hash,
                "chunks_count": len(chunks),
                "summary_length": len(summary_content),
                "has_transcript": "No captions available" not in transcript_text,
            },
        )

        return document
    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error while processing YouTube video: {url}",
            str(db_error),
            {
                "error_type": "SQLAlchemyError",
                "video_id": video_id if "video_id" in locals() else None,
            },
        )
        raise db_error
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to process YouTube video: {url}",
            str(e),
            {
                "error_type": type(e).__name__,
                "video_id": video_id if "video_id" in locals() else None,
            },
        )
        logging.error(f"Failed to process YouTube video: {e!s}")
        raise
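Not part of the commit, but a quick sanity check of the URL shapes get_youtube_video_id handles; all four forms resolve to the same (example) video ID:

if __name__ == "__main__":
    examples = [
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://www.youtube.com/embed/dQw4w9WgXcQ",
        "https://www.youtube.com/v/dQw4w9WgXcQ",
    ]
    for example in examples:
        assert get_youtube_video_id(example) == "dQw4w9WgXcQ"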