refactor: restructure background_tasks & indexing_tasks

DESKTOP-RTLN3BA\$punk 2025-08-12 15:28:13 -07:00
parent 356bbb86f5
commit 5aa52375c3
24 changed files with 4704 additions and 5149 deletions

@@ -1,188 +0,0 @@
import unittest
from datetime import datetime
from unittest.mock import Mock, patch
from github3.exceptions import ForbiddenError # Import the specific exception
# Adjust the import path based on the actual location if test_github_connector.py
# is not in the same directory as github_connector.py or if paths are set up differently.
# Assuming surfsense_backend/app/connectors/test_github_connector.py
from surfsense_backend.app.connectors.github_connector import GitHubConnector
class TestGitHubConnector(unittest.TestCase):
@patch("surfsense_backend.app.connectors.github_connector.github_login")
def test_get_user_repositories_uses_type_all(self, mock_github_login):
# Mock the GitHub client object and its methods
mock_gh_instance = Mock()
mock_github_login.return_value = mock_gh_instance
# Mock the self.gh.me() call in __init__ to prevent an actual API call
mock_gh_instance.me.return_value = Mock() # Simple mock to pass initialization
# Prepare mock repository data
mock_repo1_data = Mock()
mock_repo1_data.id = 1
mock_repo1_data.name = "repo1"
mock_repo1_data.full_name = "user/repo1"
mock_repo1_data.private = False
mock_repo1_data.html_url = "http://example.com/user/repo1"
mock_repo1_data.description = "Test repo 1"
mock_repo1_data.updated_at = datetime(
2023, 1, 1, 10, 30, 0
) # Added time component
mock_repo2_data = Mock()
mock_repo2_data.id = 2
mock_repo2_data.name = "org-repo"
mock_repo2_data.full_name = "org/org-repo"
mock_repo2_data.private = True
mock_repo2_data.html_url = "http://example.com/org/org-repo"
mock_repo2_data.description = "Org repo"
mock_repo2_data.updated_at = datetime(
2023, 1, 2, 12, 0, 0
) # Added time component
# Configure the mock for gh.repositories() call
# This method is an iterator, so it should return an iterable (e.g., a list)
mock_gh_instance.repositories.return_value = [mock_repo1_data, mock_repo2_data]
connector = GitHubConnector(token="fake_token")
repositories = connector.get_user_repositories()
# Assert that gh.repositories was called correctly
mock_gh_instance.repositories.assert_called_once_with(
type="all", sort="updated"
)
# Assert the structure and content of the returned data
expected_repositories = [
{
"id": 1,
"name": "repo1",
"full_name": "user/repo1",
"private": False,
"url": "http://example.com/user/repo1",
"description": "Test repo 1",
"last_updated": datetime(2023, 1, 1, 10, 30, 0),
},
{
"id": 2,
"name": "org-repo",
"full_name": "org/org-repo",
"private": True,
"url": "http://example.com/org/org-repo",
"description": "Org repo",
"last_updated": datetime(2023, 1, 2, 12, 0, 0),
},
]
self.assertEqual(repositories, expected_repositories)
self.assertEqual(len(repositories), 2)
@patch("surfsense_backend.app.connectors.github_connector.github_login")
def test_get_user_repositories_handles_empty_description_and_none_updated_at(
self, mock_github_login
):
# Mock the GitHub client object and its methods
mock_gh_instance = Mock()
mock_github_login.return_value = mock_gh_instance
mock_gh_instance.me.return_value = Mock()
mock_repo_data = Mock()
mock_repo_data.id = 1
mock_repo_data.name = "repo_no_desc"
mock_repo_data.full_name = "user/repo_no_desc"
mock_repo_data.private = False
mock_repo_data.html_url = "http://example.com/user/repo_no_desc"
mock_repo_data.description = None # Test None description
mock_repo_data.updated_at = None # Test None updated_at
mock_gh_instance.repositories.return_value = [mock_repo_data]
connector = GitHubConnector(token="fake_token")
repositories = connector.get_user_repositories()
mock_gh_instance.repositories.assert_called_once_with(
type="all", sort="updated"
)
expected_repositories = [
{
"id": 1,
"name": "repo_no_desc",
"full_name": "user/repo_no_desc",
"private": False,
"url": "http://example.com/user/repo_no_desc",
"description": "", # Expect empty string
"last_updated": None, # Expect None
}
]
self.assertEqual(repositories, expected_repositories)
@patch("surfsense_backend.app.connectors.github_connector.github_login")
def test_github_connector_initialization_failure_forbidden(self, mock_github_login):
# Test that __init__ raises ValueError on auth failure (ForbiddenError)
mock_gh_instance = Mock()
mock_github_login.return_value = mock_gh_instance
# Create a mock response object for the ForbiddenError
# The actual response structure might vary, but github3.py's ForbiddenError
# can be instantiated with just a response object that has a status_code.
mock_response = Mock()
mock_response.status_code = 403 # Typically Forbidden
# Setup the side_effect for self.gh.me()
mock_gh_instance.me.side_effect = ForbiddenError(mock_response)
with self.assertRaises(ValueError) as context:
GitHubConnector(token="invalid_token_forbidden")
self.assertIn(
"Invalid GitHub token or insufficient permissions.", str(context.exception)
)
@patch("surfsense_backend.app.connectors.github_connector.github_login")
def test_github_connector_initialization_failure_authentication_failed(
self, mock_github_login
):
# Test that __init__ raises ValueError on auth failure (AuthenticationFailed, which is a subclass of ForbiddenError)
# For github3.py, AuthenticationFailed is more specific for token issues.
from github3.exceptions import AuthenticationFailed
mock_gh_instance = Mock()
mock_github_login.return_value = mock_gh_instance
mock_response = Mock()
mock_response.status_code = 401 # Typically Unauthorized
mock_gh_instance.me.side_effect = AuthenticationFailed(mock_response)
with self.assertRaises(ValueError) as context:
GitHubConnector(token="invalid_token_authfailed")
self.assertIn(
"Invalid GitHub token or insufficient permissions.", str(context.exception)
)
@patch("surfsense_backend.app.connectors.github_connector.github_login")
def test_get_user_repositories_handles_api_exception(self, mock_github_login):
mock_gh_instance = Mock()
mock_github_login.return_value = mock_gh_instance
mock_gh_instance.me.return_value = Mock()
# Simulate an exception when calling repositories
mock_gh_instance.repositories.side_effect = Exception("API Error")
connector = GitHubConnector(token="fake_token")
# We expect it to log an error and return an empty list
with patch(
"surfsense_backend.app.connectors.github_connector.logger"
) as mock_logger:
repositories = connector.get_user_repositories()
self.assertEqual(repositories, [])
mock_logger.error.assert_called_once()
self.assertIn(
"Failed to fetch GitHub repositories: API Error",
mock_logger.error.call_args[0][0],
)
if __name__ == "__main__":
unittest.main()

@@ -1,507 +0,0 @@
import unittest
from unittest.mock import Mock, call, patch
from slack_sdk.errors import SlackApiError
# Since test_slack_history.py is in the same directory as slack_history.py
from .slack_history import SlackHistory
class TestSlackHistoryGetAllChannels(unittest.TestCase):
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_pagination_with_delay(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
# Mock API responses now include is_private and is_member
page1_response = {
"channels": [
{"name": "general", "id": "C1", "is_private": False, "is_member": True},
{"name": "dev", "id": "C0", "is_private": False, "is_member": True},
],
"response_metadata": {"next_cursor": "cursor123"},
}
page2_response = {
"channels": [
{"name": "random", "id": "C2", "is_private": True, "is_member": True}
],
"response_metadata": {"next_cursor": ""},
}
mock_client_instance.conversations_list.side_effect = [
page1_response,
page2_response,
]
slack_history = SlackHistory(token="fake_token")
channels_list = slack_history.get_all_channels(include_private=True)
expected_channels_list = [
{"id": "C1", "name": "general", "is_private": False, "is_member": True},
{"id": "C0", "name": "dev", "is_private": False, "is_member": True},
{"id": "C2", "name": "random", "is_private": True, "is_member": True},
]
self.assertEqual(len(channels_list), 3)
self.assertListEqual(
channels_list, expected_channels_list
) # Assert list equality
expected_calls = [
call(types="public_channel,private_channel", cursor=None, limit=1000),
call(
types="public_channel,private_channel", cursor="cursor123", limit=1000
),
]
mock_client_instance.conversations_list.assert_has_calls(expected_calls)
self.assertEqual(mock_client_instance.conversations_list.call_count, 2)
mock_sleep.assert_called_once_with(3)
mock_logger.info.assert_called_once_with(
"Paginating for channels, waiting 3 seconds before next call. Cursor: cursor123"
)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_rate_limit_with_retry_after(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 429
mock_error_response.headers = {"Retry-After": "5"}
successful_response = {
"channels": [
{"name": "general", "id": "C1", "is_private": False, "is_member": True}
],
"response_metadata": {"next_cursor": ""},
}
mock_client_instance.conversations_list.side_effect = [
SlackApiError(message="ratelimited", response=mock_error_response),
successful_response,
]
slack_history = SlackHistory(token="fake_token")
channels_list = slack_history.get_all_channels(include_private=True)
expected_channels_list = [
{"id": "C1", "name": "general", "is_private": False, "is_member": True}
]
self.assertEqual(len(channels_list), 1)
self.assertListEqual(channels_list, expected_channels_list)
mock_sleep.assert_called_once_with(5)
mock_logger.warning.assert_called_once_with(
"Slack API rate limit hit while fetching channels. Waiting for 5 seconds. Cursor: None"
)
expected_calls = [
call(types="public_channel,private_channel", cursor=None, limit=1000),
call(types="public_channel,private_channel", cursor=None, limit=1000),
]
mock_client_instance.conversations_list.assert_has_calls(expected_calls)
self.assertEqual(mock_client_instance.conversations_list.call_count, 2)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_rate_limit_no_retry_after_valid_header(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 429
mock_error_response.headers = {"Retry-After": "invalid_value"}
successful_response = {
"channels": [
{"name": "general", "id": "C1", "is_private": False, "is_member": True}
],
"response_metadata": {"next_cursor": ""},
}
mock_client_instance.conversations_list.side_effect = [
SlackApiError(message="ratelimited", response=mock_error_response),
successful_response,
]
slack_history = SlackHistory(token="fake_token")
channels_list = slack_history.get_all_channels(include_private=True)
expected_channels_list = [
{"id": "C1", "name": "general", "is_private": False, "is_member": True}
]
self.assertListEqual(channels_list, expected_channels_list)
mock_sleep.assert_called_once_with(60) # Default fallback
mock_logger.warning.assert_called_once_with(
"Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None"
)
self.assertEqual(mock_client_instance.conversations_list.call_count, 2)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_rate_limit_no_retry_after_header(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 429
mock_error_response.headers = {}
successful_response = {
"channels": [
{"name": "general", "id": "C1", "is_private": False, "is_member": True}
],
"response_metadata": {"next_cursor": ""},
}
mock_client_instance.conversations_list.side_effect = [
SlackApiError(message="ratelimited", response=mock_error_response),
successful_response,
]
slack_history = SlackHistory(token="fake_token")
channels_list = slack_history.get_all_channels(include_private=True)
expected_channels_list = [
{"id": "C1", "name": "general", "is_private": False, "is_member": True}
]
self.assertListEqual(channels_list, expected_channels_list)
mock_sleep.assert_called_once_with(60) # Default fallback
mock_logger.warning.assert_called_once_with(
"Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None"
)
self.assertEqual(mock_client_instance.conversations_list.call_count, 2)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_other_slack_api_error(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 500
mock_error_response.headers = {}
mock_error_response.data = {"ok": False, "error": "internal_error"}
original_error = SlackApiError(
message="server error", response=mock_error_response
)
mock_client_instance.conversations_list.side_effect = original_error
slack_history = SlackHistory(token="fake_token")
with self.assertRaises(SlackApiError) as context:
slack_history.get_all_channels(include_private=True)
self.assertEqual(context.exception.response.status_code, 500)
self.assertIn("server error", str(context.exception))
mock_sleep.assert_not_called()
mock_logger.warning.assert_not_called() # Ensure no rate limit log
mock_client_instance.conversations_list.assert_called_once_with(
types="public_channel,private_channel", cursor=None, limit=1000
)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_get_all_channels_handles_missing_name_id_gracefully(
self, mock_web_client, mock_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
response_with_malformed_data = {
"channels": [
{"id": "C1_missing_name", "is_private": False, "is_member": True},
{"name": "channel_missing_id", "is_private": False, "is_member": True},
{
"name": "general",
"id": "C2_valid",
"is_private": False,
"is_member": True,
},
],
"response_metadata": {"next_cursor": ""},
}
mock_client_instance.conversations_list.return_value = (
response_with_malformed_data
)
slack_history = SlackHistory(token="fake_token")
channels_list = slack_history.get_all_channels(include_private=True)
expected_channels_list = [
{
"id": "C2_valid",
"name": "general",
"is_private": False,
"is_member": True,
}
]
self.assertEqual(len(channels_list), 1)
self.assertListEqual(channels_list, expected_channels_list)
self.assertEqual(mock_logger.warning.call_count, 2)
mock_logger.warning.assert_any_call(
"Channel found with missing name or id. Data: {'id': 'C1_missing_name', 'is_private': False, 'is_member': True}"
)
mock_logger.warning.assert_any_call(
"Channel found with missing name or id. Data: {'name': 'channel_missing_id', 'is_private': False, 'is_member': True}"
)
mock_sleep.assert_not_called()
mock_client_instance.conversations_list.assert_called_once_with(
types="public_channel,private_channel", cursor=None, limit=1000
)
if __name__ == "__main__":
unittest.main()
class TestSlackHistoryGetConversationHistory(unittest.TestCase):
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_proactive_delay_single_page(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_client_instance.conversations_history.return_value = {
"messages": [{"text": "msg1"}],
"has_more": False,
}
slack_history = SlackHistory(token="fake_token")
slack_history.get_conversation_history(channel_id="C123")
mock_time_sleep.assert_called_once_with(1.2) # Proactive delay
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_proactive_delay_multiple_pages(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_client_instance.conversations_history.side_effect = [
{
"messages": [{"text": "msg1"}],
"has_more": True,
"response_metadata": {"next_cursor": "cursor1"},
},
{"messages": [{"text": "msg2"}], "has_more": False},
]
slack_history = SlackHistory(token="fake_token")
slack_history.get_conversation_history(channel_id="C123")
# Expected calls: 1.2 (page1), 1.2 (page2)
self.assertEqual(mock_time_sleep.call_count, 2)
mock_time_sleep.assert_has_calls([call(1.2), call(1.2)])
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 429
mock_error_response.headers = {"Retry-After": "5"}
mock_client_instance.conversations_history.side_effect = [
SlackApiError(message="ratelimited", response=mock_error_response),
{"messages": [{"text": "msg1"}], "has_more": False},
]
slack_history = SlackHistory(token="fake_token")
messages = slack_history.get_conversation_history(channel_id="C123")
self.assertEqual(len(messages), 1)
self.assertEqual(messages[0]["text"], "msg1")
# Expected sleep calls: 1.2 (proactive for 1st attempt), 5 (rate limit), 1.2 (proactive for 2nd attempt)
mock_time_sleep.assert_has_calls(
[call(1.2), call(5), call(1.2)], any_order=False
)
mock_logger.warning.assert_called_once() # Check that a warning was logged for rate limiting
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_not_in_channel_error(self, mock_web_client, mock_time_sleep, mock_logger):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = (
403 # Typical for not_in_channel, but data matters more
)
mock_error_response.data = {"ok": False, "error": "not_in_channel"}
# This error is now raised by the inner try-except, then caught by the outer one
mock_client_instance.conversations_history.side_effect = SlackApiError(
message="not_in_channel error", response=mock_error_response
)
slack_history = SlackHistory(token="fake_token")
messages = slack_history.get_conversation_history(channel_id="C123")
self.assertEqual(messages, [])
mock_logger.warning.assert_called_with(
"Bot is not in channel 'C123'. Cannot fetch history. Please add the bot to this channel."
)
mock_time_sleep.assert_called_once_with(
1.2
) # Proactive delay before the API call
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_other_slack_api_error_propagates(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 500
mock_error_response.data = {"ok": False, "error": "internal_error"}
original_error = SlackApiError(
message="server error", response=mock_error_response
)
mock_client_instance.conversations_history.side_effect = original_error
slack_history = SlackHistory(token="fake_token")
with self.assertRaises(SlackApiError) as context:
slack_history.get_conversation_history(channel_id="C123")
self.assertIn(
"Error retrieving history for channel C123", str(context.exception)
)
self.assertIs(context.exception.response, mock_error_response)
mock_time_sleep.assert_called_once_with(1.2) # Proactive delay
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_general_exception_propagates(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
original_error = Exception("Something broke")
mock_client_instance.conversations_history.side_effect = original_error
slack_history = SlackHistory(token="fake_token")
with self.assertRaises(Exception) as context: # Check for generic Exception
slack_history.get_conversation_history(channel_id="C123")
self.assertIs(
context.exception, original_error
) # Should re-raise the original error
mock_logger.error.assert_called_once_with(
"Unexpected error in get_conversation_history for channel C123: Something broke"
)
mock_time_sleep.assert_called_once_with(1.2) # Proactive delay
class TestSlackHistoryGetUserInfo(unittest.TestCase):
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_retry_after_logic(self, mock_web_client, mock_time_sleep, mock_logger):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 429
mock_error_response.headers = {"Retry-After": "3"} # Using 3 seconds for test
successful_user_data = {"id": "U123", "name": "testuser"}
mock_client_instance.users_info.side_effect = [
SlackApiError(message="ratelimited_userinfo", response=mock_error_response),
{"user": successful_user_data},
]
slack_history = SlackHistory(token="fake_token")
user_info = slack_history.get_user_info(user_id="U123")
self.assertEqual(user_info, successful_user_data)
# Assert that time.sleep was called for the rate limit
mock_time_sleep.assert_called_once_with(3)
mock_logger.warning.assert_called_once_with(
"Rate limited by Slack on users.info for user U123. Retrying after 3 seconds."
)
# Assert users_info was called twice (original + retry)
self.assertEqual(mock_client_instance.users_info.call_count, 2)
mock_client_instance.users_info.assert_has_calls(
[call(user="U123"), call(user="U123")]
)
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch(
"surfsense_backend.app.connectors.slack_history.time.sleep"
) # time.sleep might be called by other logic, but not expected here
@patch("slack_sdk.WebClient")
def test_other_slack_api_error_propagates(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
mock_error_response = Mock()
mock_error_response.status_code = 500 # Some other error
mock_error_response.data = {"ok": False, "error": "internal_server_error"}
original_error = SlackApiError(
message="internal server error", response=mock_error_response
)
mock_client_instance.users_info.side_effect = original_error
slack_history = SlackHistory(token="fake_token")
with self.assertRaises(SlackApiError) as context:
slack_history.get_user_info(user_id="U123")
# Check that the raised error is the one we expect
self.assertIn("Error retrieving user info for U123", str(context.exception))
self.assertIs(context.exception.response, mock_error_response)
mock_time_sleep.assert_not_called() # No rate limit sleep
@patch("surfsense_backend.app.connectors.slack_history.logger")
@patch("surfsense_backend.app.connectors.slack_history.time.sleep")
@patch("slack_sdk.WebClient")
def test_general_exception_propagates(
self, mock_web_client, mock_time_sleep, mock_logger
):
mock_client_instance = mock_web_client.return_value
original_error = Exception("A very generic problem")
mock_client_instance.users_info.side_effect = original_error
slack_history = SlackHistory(token="fake_token")
with self.assertRaises(Exception) as context:
slack_history.get_user_info(user_id="U123")
self.assertIs(
context.exception, original_error
) # Check it's the exact same exception
mock_logger.error.assert_called_once_with(
"Unexpected error in get_user_info for user U123: A very generic problem"
)
mock_time_sleep.assert_not_called() # No rate limit sleep

@@ -10,7 +10,7 @@ from app.config import config as app_config
from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
from app.services.task_logging_service import TaskLoggingService
-from app.tasks.background_tasks import (
+from app.tasks.document_processors import (
add_crawled_url_document,
add_extension_received_document,
add_received_file_document_using_docling,

@@ -35,7 +35,7 @@ from app.schemas import (
SearchSourceConnectorRead,
SearchSourceConnectorUpdate,
)
-from app.tasks.connectors_indexing_tasks import (
+from app.tasks.connector_indexers import (
index_clickup_tasks,
index_confluence_pages,
index_discord_messages,

File diff suppressed because it is too large.

@@ -0,0 +1,54 @@
"""
Connector indexers module for background tasks.
This module provides a collection of connector indexers for different platforms
and services. Each indexer is responsible for handling the indexing of content
from a specific connector type.
Available indexers:
- Slack: Index messages from Slack channels
- Notion: Index pages from Notion workspaces
- GitHub: Index repositories and files from GitHub
- Linear: Index issues from Linear workspaces
- Jira: Index issues from Jira projects
- Confluence: Index pages from Confluence spaces
- Discord: Index messages from Discord servers
- ClickUp: Index tasks from ClickUp workspaces
- Google Calendar: Index events from Google Calendar
"""
# Communication platforms
from .clickup_indexer import index_clickup_tasks
from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages
# Development platforms
from .github_indexer import index_github_repos
# Calendar and scheduling
from .google_calendar_indexer import index_google_calendar_events
from .jira_indexer import index_jira_issues
# Issue tracking and project management
from .linear_indexer import index_linear_issues
# Documentation and knowledge management
from .notion_indexer import index_notion_pages
from .slack_indexer import index_slack_messages
__all__ = [
"index_clickup_tasks",
"index_confluence_pages",
"index_discord_messages",
# Development platforms
"index_github_repos",
# Calendar and scheduling
"index_google_calendar_events",
"index_jira_issues",
# Issue tracking and project management
"index_linear_issues",
# Documentation and knowledge management
"index_notion_pages",
# Communication platforms
"index_slack_messages",
]
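
For orientation, here is a minimal usage sketch of this package's public API. It assumes an async SQLAlchemy session factory is available (the name session_factory below is a placeholder, not the project's actual wiring), and the connector, search-space, and user IDs are hypothetical.

from app.tasks.connector_indexers import index_clickup_tasks

async def run_clickup_indexing(session_factory) -> None:
    # session_factory stands in for the app's async_sessionmaker.
    async with session_factory() as session:
        indexed, error = await index_clickup_tasks(
            session=session,
            connector_id=1,          # hypothetical connector ID
            search_space_id=42,      # hypothetical search space ID
            user_id="user-uuid",     # hypothetical user ID
            start_date="2025-01-01",
            end_date="2025-06-30",
        )
        if error:
            print(f"ClickUp indexing failed: {error}")
        else:
            print(f"Indexed {indexed} ClickUp tasks")

The indexers shown in this commit (ClickUp, Confluence, Discord) all take the same parameters and return a tuple[int, str | None], where the second element carries an error or status message, or None.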

@@ -0,0 +1,183 @@
"""
Base functionality and shared imports for connector indexers.
"""
import logging
from datetime import datetime, timedelta
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config
from app.db import (
Chunk,
Document,
SearchSourceConnector,
SearchSourceConnectorType,
)
# Set up logging
logger = logging.getLogger(__name__)
async def check_duplicate_document_by_hash(
session: AsyncSession, content_hash: str
) -> Document | None:
"""
Check if a document with the given content hash already exists.
Args:
session: Database session
content_hash: Hash of the document content
Returns:
Existing document if found, None otherwise
"""
existing_doc_result = await session.execute(
select(Document).where(Document.content_hash == content_hash)
)
return existing_doc_result.scalars().first()
async def create_document_chunks(content: str) -> list[Chunk]:
"""
Create chunks from document content.
Args:
content: Document content to chunk
Returns:
List of Chunk objects with embeddings
"""
return [
Chunk(
content=chunk.text,
embedding=config.embedding_model_instance.embed(chunk.text),
)
for chunk in config.chunker_instance.chunk(content)
]
async def get_connector_by_id(
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
) -> SearchSourceConnector | None:
"""
Get a connector by ID and type from the database.
Args:
session: Database session
connector_id: ID of the connector
connector_type: Expected type of the connector
Returns:
Connector object if found, None otherwise
"""
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id,
SearchSourceConnector.connector_type == connector_type,
)
)
return result.scalars().first()
def calculate_date_range(
connector: SearchSourceConnector,
start_date: str | None = None,
end_date: str | None = None,
default_days_back: int = 365,
) -> tuple[str, str]:
"""
Calculate date range for indexing based on provided dates or connector's last indexed date.
Args:
connector: The connector object
start_date: Optional start date string (YYYY-MM-DD)
end_date: Optional end date string (YYYY-MM-DD)
default_days_back: Default number of days to go back if no last indexed date
Returns:
Tuple of (start_date_str, end_date_str)
"""
if start_date is not None and end_date is not None:
return start_date, end_date
# Fall back to calculating dates based on last_indexed_at
calculated_end_date = datetime.now()
# Use last_indexed_at as start date if available, otherwise use default_days_back
if connector.last_indexed_at:
# Convert dates to be comparable (both timezone-naive)
last_indexed_naive = (
connector.last_indexed_at.replace(tzinfo=None)
if connector.last_indexed_at.tzinfo
else connector.last_indexed_at
)
# Check if last_indexed_at is in the future or after end_date
if last_indexed_naive > calculated_end_date:
logger.warning(
f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using {default_days_back} days ago instead."
)
calculated_start_date = calculated_end_date - timedelta(
days=default_days_back
)
else:
calculated_start_date = last_indexed_naive
logger.info(
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
)
else:
calculated_start_date = calculated_end_date - timedelta(days=default_days_back)
logger.info(
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} ({default_days_back} days ago) as start date"
)
# Use calculated dates if not provided
start_date_str = (
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
)
end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
return start_date_str, end_date_str
async def update_connector_last_indexed(
session: AsyncSession,
connector: SearchSourceConnector,
update_last_indexed: bool = True,
) -> None:
"""
Update the last_indexed_at timestamp for a connector.
Args:
session: Database session
connector: The connector object
update_last_indexed: Whether to actually update the timestamp
"""
if update_last_indexed:
connector.last_indexed_at = datetime.now()
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
def build_document_metadata_string(
metadata_sections: list[tuple[str, list[str]]],
) -> str:
"""
Build a document string from metadata sections.
Args:
metadata_sections: List of (section_title, section_content) tuples
Returns:
Combined document string
"""
document_parts = ["<DOCUMENT>"]
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
return "\n".join(document_parts)
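
As a quick illustration of the wrapper format build_document_metadata_string produces (the import path assumes this module lives at app.tasks.connector_indexers.base; the section values are hypothetical):

from app.tasks.connector_indexers.base import build_document_metadata_string

doc_string = build_document_metadata_string(
    [
        ("METADATA", ["CHANNEL_NAME: general", "MESSAGE_COUNT: 2"]),
        ("CONTENT", ["FORMAT: markdown", "TEXT_START", "...channel text...", "TEXT_END"]),
    ]
)
# Produces a newline-joined block:
# <DOCUMENT>
# <METADATA>
# CHANNEL_NAME: general
# MESSAGE_COUNT: 2
# </METADATA>
# <CONTENT>
# FORMAT: markdown
# TEXT_START
# ...channel text...
# TEXT_END
# </CONTENT>
# </DOCUMENT>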

@@ -0,0 +1,301 @@
"""
ClickUp connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.clickup_connector import ClickUpConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_clickup_tasks(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index tasks from ClickUp workspace.
Args:
session: Database session
connector_id: ID of the ClickUp connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for filtering tasks (YYYY-MM-DD format)
end_date: End date for filtering tasks (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp
Returns:
Tuple of (number of indexed tasks, error message if any)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="clickup_tasks_indexing",
source="connector_indexing_task",
message=f"Starting ClickUp tasks indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get connector configuration
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.CLICKUP_CONNECTOR
)
if not connector:
error_msg = f"ClickUp connector with ID {connector_id} not found"
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a ClickUp connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return 0, error_msg
# Extract ClickUp configuration
clickup_api_token = connector.config.get("CLICKUP_API_TOKEN")
if not clickup_api_token:
error_msg = "ClickUp API token not found in connector configuration"
await task_logger.log_task_failure(
log_entry,
f"ClickUp API token not found in connector config for connector {connector_id}",
"Missing ClickUp token",
{"error_type": "MissingToken"},
)
return 0, error_msg
await task_logger.log_task_progress(
log_entry,
f"Initializing ClickUp client for connector {connector_id}",
{"stage": "client_initialization"},
)
clickup_client = ClickUpConnector(api_token=clickup_api_token)
# Get authorized workspaces
await task_logger.log_task_progress(
log_entry,
"Fetching authorized ClickUp workspaces",
{"stage": "workspace_fetching"},
)
workspaces_response = clickup_client.get_authorized_workspaces()
workspaces = workspaces_response.get("teams", [])
if not workspaces:
error_msg = "No authorized ClickUp workspaces found"
await task_logger.log_task_failure(
log_entry,
f"No authorized ClickUp workspaces found for connector {connector_id}",
"No workspaces found",
{"error_type": "NoWorkspacesFound"},
)
return 0, error_msg
documents_indexed = 0
documents_skipped = 0
# Iterate workspaces and fetch tasks
for workspace in workspaces:
workspace_id = workspace.get("id")
workspace_name = workspace.get("name", "Unknown Workspace")
if not workspace_id:
continue
await task_logger.log_task_progress(
log_entry,
f"Processing workspace: {workspace_name}",
{"stage": "workspace_processing", "workspace_id": workspace_id},
)
# Fetch tasks for date range if provided
if start_date and end_date:
tasks, error = clickup_client.get_tasks_in_date_range(
workspace_id=workspace_id,
start_date=start_date,
end_date=end_date,
include_closed=True,
)
if error:
logger.warning(
f"Error fetching tasks from workspace {workspace_name}: {error}"
)
continue
else:
tasks = clickup_client.get_workspace_tasks(
workspace_id=workspace_id, include_closed=True
)
await task_logger.log_task_progress(
log_entry,
f"Found {len(tasks)} tasks in workspace {workspace_name}",
{"stage": "tasks_found", "task_count": len(tasks)},
)
for task in tasks:
try:
task_id = task.get("id")
task_name = task.get("name", "Untitled Task")
task_description = task.get("description", "")
task_status = task.get("status", {}).get("status", "Unknown")
task_priority = (
task.get("priority", {}).get("priority", "Unknown")
if task.get("priority")
else "None"
)
task_assignees = task.get("assignees", [])
task_due_date = task.get("due_date")
task_created = task.get("date_created")
task_updated = task.get("date_updated")
task_list = task.get("list", {})
task_list_name = task_list.get("name", "Unknown List")
task_space = task.get("space", {})
task_space_name = task_space.get("name", "Unknown Space")
# Build task content string
content_parts: list[str] = [f"Task: {task_name}"]
if task_description:
content_parts.append(f"Description: {task_description}")
content_parts.extend(
[
f"Status: {task_status}",
f"Priority: {task_priority}",
f"List: {task_list_name}",
f"Space: {task_space_name}",
]
)
if task_assignees:
assignee_names = [
assignee.get("username", "Unknown")
for assignee in task_assignees
]
content_parts.append(
f"Assignees: {', '.join(assignee_names)}"
)
if task_due_date:
content_parts.append(f"Due Date: {task_due_date}")
task_content = "\n".join(content_parts)
if not task_content.strip():
logger.warning(f"Skipping task with no content: {task_name}")
documents_skipped += 1
continue
# Hash for duplicates
content_hash = generate_content_hash(task_content, search_space_id)
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing."
)
documents_skipped += 1
continue
# Embedding and chunks
summary_embedding = config.embedding_model_instance.embed(
task_content
)
chunks = await create_document_chunks(task_content)
document = Document(
search_space_id=search_space_id,
title=f"Task - {task_name}",
document_type=DocumentType.CLICKUP_CONNECTOR,
document_metadata={
"task_id": task_id,
"task_name": task_name,
"task_status": task_status,
"task_priority": task_priority,
"task_assignees": task_assignees,
"task_due_date": task_due_date,
"task_created": task_created,
"task_updated": task_updated,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=task_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new task {task_name}")
except Exception as e:
logger.error(
f"Error processing task {task.get('name', 'Unknown')}: {e!s}",
exc_info=True,
)
documents_skipped += 1
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await task_logger.log_task_success(
log_entry,
f"Successfully completed clickup indexing for connector {connector_id}",
{
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
},
)
logger.info(
f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped"
)
return total_processed, None
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during ClickUp indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index ClickUp tasks for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index ClickUp tasks: {e!s}", exc_info=True)
return 0, f"Failed to index ClickUp tasks: {e!s}"

@@ -0,0 +1,338 @@
"""
Confluence connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.confluence_connector import ConfluenceConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
calculate_date_range,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_confluence_pages(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Confluence pages and comments.
Args:
session: Database session
connector_id: ID of the Confluence connector
search_space_id: ID of the search space to store documents in
user_id: User ID
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="confluence_pages_indexing",
source="connector_indexing_task",
message=f"Starting Confluence pages indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.CONFLUENCE_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return 0, f"Connector with ID {connector_id} not found"
# Get the Confluence credentials from the connector config
confluence_email = connector.config.get("CONFLUENCE_EMAIL")
confluence_api_token = connector.config.get("CONFLUENCE_API_TOKEN")
confluence_base_url = connector.config.get("CONFLUENCE_BASE_URL")
if not confluence_email or not confluence_api_token or not confluence_base_url:
await task_logger.log_task_failure(
log_entry,
f"Confluence credentials not found in connector config for connector {connector_id}",
"Missing Confluence credentials",
{"error_type": "MissingCredentials"},
)
return 0, "Confluence credentials not found in connector config"
# Initialize Confluence client
await task_logger.log_task_progress(
log_entry,
f"Initializing Confluence client for connector {connector_id}",
{"stage": "client_initialization"},
)
confluence_client = ConfluenceConnector(
base_url=confluence_base_url,
email=confluence_email,
api_token=confluence_api_token,
)
# Calculate date range
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
await task_logger.log_task_progress(
log_entry,
f"Fetching Confluence pages from {start_date_str} to {end_date_str}",
{
"stage": "fetching_pages",
"start_date": start_date_str,
"end_date": end_date_str,
},
)
# Get pages within date range
try:
pages, error = confluence_client.get_pages_by_date_range(
start_date=start_date_str, end_date=end_date_str, include_comments=True
)
if error:
logger.error(f"Failed to get Confluence pages: {error}")
# Don't treat "No pages found" as an error that should stop indexing
if "No pages found" in error:
logger.info(
"No pages found is not a critical error, continuing with update"
)
if update_last_indexed:
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found"
)
await task_logger.log_task_success(
log_entry,
f"No Confluence pages found in date range {start_date_str} to {end_date_str}",
{"pages_found": 0},
)
return 0, None
else:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Confluence pages: {error}",
"API Error",
{"error_type": "APIError"},
)
return 0, f"Failed to get Confluence pages: {error}"
logger.info(f"Retrieved {len(pages)} pages from Confluence API")
except Exception as e:
logger.error(f"Error fetching Confluence pages: {e!s}", exc_info=True)
return 0, f"Error fetching Confluence pages: {e!s}"
# Process and index each page
documents_indexed = 0
skipped_pages = []
documents_skipped = 0
for page in pages:
try:
page_id = page.get("id")
page_title = page.get("title", "")
space_id = page.get("spaceId", "")
if not page_id or not page_title:
logger.warning(
f"Skipping page with missing ID or title: {page_id or 'Unknown'}"
)
skipped_pages.append(f"{page_title or 'Unknown'} (missing data)")
documents_skipped += 1
continue
# Extract page content
page_content = ""
if page.get("body") and page["body"].get("storage"):
page_content = page["body"]["storage"].get("value", "")
# Add comments to content
comments = page.get("comments", [])
comments_content = ""
if comments:
comments_content = "\n\n## Comments\n\n"
for comment in comments:
comment_body = ""
if comment.get("body") and comment["body"].get("storage"):
comment_body = comment["body"]["storage"].get("value", "")
comment_author = comment.get("version", {}).get(
"authorId", "Unknown"
)
comment_date = comment.get("version", {}).get("createdAt", "")
comments_content += f"**Comment by {comment_author}** ({comment_date}):\n{comment_body}\n\n"
# Combine page content with comments
full_content = f"# {page_title}\n\n{page_content}{comments_content}"
if not full_content.strip():
logger.warning(f"Skipping page with no content: {page_title}")
skipped_pages.append(f"{page_title} (no content)")
documents_skipped += 1
continue
# Create a simple summary
summary_content = (
f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
)
if page_content:
# Take first 500 characters of content for summary
content_preview = page_content[:500]
if len(page_content) > 500:
content_preview += "..."
summary_content += f"Content Preview: {content_preview}\n\n"
# Add comment count
comment_count = len(comments)
summary_content += f"Comments: {comment_count}"
# Generate content hash
content_hash = generate_content_hash(full_content, search_space_id)
# Check if document already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
)
documents_skipped += 1
continue
# Generate embedding for the summary
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full page content with comments
chunks = await create_document_chunks(full_content)
# Create and store new document
logger.info(f"Creating new document for page {page_title}")
document = Document(
search_space_id=search_space_id,
title=f"Confluence - {page_title}",
document_type=DocumentType.CONFLUENCE_CONNECTOR,
document_metadata={
"page_id": page_id,
"page_title": page_title,
"space_id": space_id,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new page {page_title}")
except Exception as e:
logger.error(
f"Error processing page {page.get('title', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_pages.append(
f"{page.get('title', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this page and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
logger.info(
"Successfully committed all Confluence document changes to database"
)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Confluence indexing for connector {connector_id}",
{
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_pages_count": len(skipped_pages),
},
)
logger.info(
f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Confluence indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Confluence pages for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Confluence pages: {e!s}", exc_info=True)
return 0, f"Failed to index Confluence pages: {e!s}"
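
The route modules touched earlier in this diff import these indexers directly; the sketch below shows one hedged way such an indexer could be scheduled from a FastAPI endpoint. The route path, the engine/session factory, and the hard-coded user ID are illustrative assumptions, not the project's actual route code.

from fastapi import APIRouter, BackgroundTasks
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from app.tasks.connector_indexers import index_confluence_pages

router = APIRouter()

# Assumed engine and session factory; the real app builds these in app.db.
engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/surfsense")  # hypothetical DSN
SessionFactory = async_sessionmaker(engine, expire_on_commit=False)

async def run_confluence_indexing(connector_id: int, search_space_id: int, user_id: str) -> None:
    # Use a dedicated session for the background run; the indexer commits or
    # rolls back internally and returns (documents_indexed, error_or_None).
    async with SessionFactory() as session:
        count, error = await index_confluence_pages(session, connector_id, search_space_id, user_id)
        if error:
            print(f"Confluence indexing finished with error: {error}")  # real code would log this

@router.post("/connectors/{connector_id}/index-confluence")  # hypothetical route
async def trigger_confluence_indexing(
    connector_id: int, search_space_id: int, background_tasks: BackgroundTasks
):
    background_tasks.add_task(run_confluence_indexing, connector_id, search_space_id, "user-uuid")
    return {"status": "indexing scheduled"}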

@@ -0,0 +1,441 @@
"""
Discord connector indexer.
"""
import asyncio
from datetime import UTC, datetime, timedelta
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.discord_connector import DiscordConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
build_document_metadata_string,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_discord_messages(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Discord messages from all accessible channels.
Args:
session: Database session
connector_id: ID of the Discord connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="discord_messages_indexing",
source="connector_indexing_task",
message=f"Starting Discord messages indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving Discord connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.DISCORD_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a Discord connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a Discord connector",
)
# Get the Discord token from the connector config
discord_token = connector.config.get("DISCORD_BOT_TOKEN")
if not discord_token:
await task_logger.log_task_failure(
log_entry,
f"Discord token not found in connector config for connector {connector_id}",
"Missing Discord token",
{"error_type": "MissingToken"},
)
return 0, "Discord token not found in connector config"
logger.info(f"Starting Discord indexing for connector {connector_id}")
# Initialize Discord client
await task_logger.log_task_progress(
log_entry,
f"Initializing Discord client for connector {connector_id}",
{"stage": "client_initialization"},
)
discord_client = DiscordConnector(token=discord_token)
# Calculate date range
if start_date is None or end_date is None:
# Fall back to calculating dates based on last_indexed_at
calculated_end_date = datetime.now(UTC)
# Use last_indexed_at as start date if available, otherwise use 365 days ago
if connector.last_indexed_at:
calculated_start_date = connector.last_indexed_at.replace(tzinfo=UTC)
logger.info(
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
)
else:
calculated_start_date = calculated_end_date - timedelta(days=365)
logger.info(
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
)
# Use calculated dates if not provided, convert to ISO format for Discord API
if start_date is None:
start_date_iso = calculated_start_date.isoformat()
else:
# Convert YYYY-MM-DD to ISO format
start_date_iso = (
datetime.strptime(start_date, "%Y-%m-%d")
.replace(tzinfo=UTC)
.isoformat()
)
if end_date is None:
end_date_iso = calculated_end_date.isoformat()
else:
# Convert YYYY-MM-DD to ISO format
end_date_iso = (
datetime.strptime(end_date, "%Y-%m-%d")
.replace(tzinfo=UTC)
.isoformat()
)
else:
# Convert provided dates to ISO format for Discord API
start_date_iso = (
datetime.strptime(start_date, "%Y-%m-%d")
.replace(tzinfo=UTC)
.isoformat()
)
end_date_iso = (
datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC).isoformat()
)
logger.info(
f"Indexing Discord messages from {start_date_iso} to {end_date_iso}"
)
try:
await task_logger.log_task_progress(
log_entry,
f"Starting Discord bot and fetching guilds for connector {connector_id}",
{"stage": "fetch_guilds"},
)
logger.info("Starting Discord bot to fetch guilds")
discord_client._bot_task = asyncio.create_task(discord_client.start_bot())
await discord_client._wait_until_ready()
logger.info("Fetching Discord guilds")
guilds = await discord_client.get_guilds()
logger.info(f"Found {len(guilds)} guilds")
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Discord guilds for connector {connector_id}",
str(e),
{"error_type": "GuildFetchError"},
)
logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True)
await discord_client.close_bot()
return 0, f"Failed to get Discord guilds: {e!s}"
if not guilds:
await task_logger.log_task_success(
log_entry,
f"No Discord guilds found for connector {connector_id}",
{"guilds_found": 0},
)
logger.info("No Discord guilds found to index")
await discord_client.close_bot()
return 0, "No Discord guilds found"
# Track results
documents_indexed = 0
documents_skipped = 0
skipped_channels: list[str] = []
# Process each guild and channel
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(guilds)} Discord guilds",
{"stage": "process_guilds", "total_guilds": len(guilds)},
)
try:
for guild in guilds:
guild_id = guild["id"]
guild_name = guild["name"]
logger.info(f"Processing guild: {guild_name} ({guild_id})")
try:
channels = await discord_client.get_text_channels(guild_id)
if not channels:
logger.info(f"No channels found in guild {guild_name}. Skipping.")
skipped_channels.append(f"{guild_name} (no channels)")
documents_skipped += 1
continue
for channel in channels:
channel_id = channel["id"]
channel_name = channel["name"]
try:
messages = await discord_client.get_channel_history(
channel_id=channel_id,
start_date=start_date_iso,
end_date=end_date_iso,
)
except Exception as e:
logger.error(
f"Failed to get messages for channel {channel_name}: {e!s}"
)
skipped_channels.append(
f"{guild_name}#{channel_name} (fetch error)"
)
documents_skipped += 1
continue
if not messages:
logger.info(
f"No messages found in channel {channel_name} for the specified date range."
)
documents_skipped += 1
continue
# Filter/format messages
formatted_messages: list[dict] = []
for msg in messages:
# Optionally skip system messages
if msg.get("type") in ["system"]:
continue
formatted_messages.append(msg)
if not formatted_messages:
logger.info(
f"No valid messages found in channel {channel_name} after filtering."
)
documents_skipped += 1
continue
# Convert messages to markdown format
channel_content = (
f"# Discord Channel: {guild_name} / {channel_name}\n\n"
)
for msg in formatted_messages:
user_name = msg.get("author_name", "Unknown User")
timestamp = msg.get("created_at", "Unknown Time")
text = msg.get("content", "")
channel_content += (
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
)
# Metadata sections
metadata_sections = [
(
"METADATA",
[
f"GUILD_NAME: {guild_name}",
f"GUILD_ID: {guild_id}",
f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}",
],
),
(
"CONTENT",
[
"FORMAT: markdown",
"TEXT_START",
channel_content,
"TEXT_END",
],
),
]
combined_document_string = build_document_metadata_string(
metadata_sections
)
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
# Skip duplicates by hash
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing."
)
documents_skipped += 1
continue
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
logger.error(
f"No long context LLM configured for user {user_id}"
)
skipped_channels.append(
f"{guild_name}#{channel_name} (no LLM configured)"
)
documents_skipped += 1
continue
# Generate summary using summary_chain
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
summary_result = await summary_chain.ainvoke(
{"document": combined_document_string}
)
summary_content = summary_result.content
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Chunks from channel content
chunks = await create_document_chunks(channel_content)
# Create and store new document
document = Document(
search_space_id=search_space_id,
title=f"Discord - {guild_name}#{channel_name}",
document_type=DocumentType.DISCORD_CONNECTOR,
document_metadata={
"guild_name": guild_name,
"guild_id": guild_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_count": len(formatted_messages),
"start_date": start_date_iso,
"end_date": end_date_iso,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
)
except Exception as e:
logger.error(
f"Error processing guild {guild_name}: {e!s}", exc_info=True
)
skipped_channels.append(f"{guild_name} (processing error)")
documents_skipped += 1
continue
finally:
await discord_client.close_bot()
# Update last_indexed_at only if we indexed at least one
if documents_indexed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
# Prepare result message
result_message = None
if skipped_channels:
result_message = (
f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: "
+ ", ".join(skipped_channels)
)
else:
result_message = f"Processed {documents_indexed} channels."
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Discord indexing for connector {connector_id}",
{
"channels_processed": documents_indexed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_channels_count": len(skipped_channels),
"guilds_processed": len(guilds),
"result_message": result_message,
},
)
logger.info(
f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
)
return documents_indexed, result_message
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Discord indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Discord messages for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Discord messages: {e!s}", exc_info=True)
return 0, f"Failed to index Discord messages: {e!s}"

View file

@ -0,0 +1,326 @@
"""
GitHub connector indexer.
"""
from datetime import UTC, datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.github_connector import GitHubConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
)
async def index_github_repos(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index code and documentation files from accessible GitHub repositories.
Args:
session: Database session
connector_id: ID of the GitHub connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="github_repos_indexing",
source="connector_indexing_task",
message=f"Starting GitHub repositories indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
documents_processed = 0
errors = []
try:
# 1. Get the GitHub connector from the database
await task_logger.log_task_progress(
log_entry,
f"Retrieving GitHub connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a GitHub connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a GitHub connector",
)
# 2. Get the GitHub PAT and selected repositories from the connector config
github_pat = connector.config.get("GITHUB_PAT")
repo_full_names_to_index = connector.config.get("repo_full_names")
if not github_pat:
await task_logger.log_task_failure(
log_entry,
f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
"Missing GitHub PAT",
{"error_type": "MissingToken"},
)
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
if not repo_full_names_to_index or not isinstance(
repo_full_names_to_index, list
):
await task_logger.log_task_failure(
log_entry,
f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}",
"Invalid repo configuration",
{"error_type": "InvalidConfiguration"},
)
return 0, "'repo_full_names' not found or is not a list in connector config"
# 3. Initialize GitHub connector client
await task_logger.log_task_progress(
log_entry,
f"Initializing GitHub client for connector {connector_id}",
{
"stage": "client_initialization",
"repo_count": len(repo_full_names_to_index),
},
)
try:
github_client = GitHubConnector(token=github_pat)
except ValueError as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to initialize GitHub client for connector {connector_id}",
str(e),
{"error_type": "ClientInitializationError"},
)
return 0, f"Failed to initialize GitHub client: {e!s}"
# 4. Log the start of indexing for the selected repositories
await task_logger.log_task_progress(
log_entry,
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
{
"stage": "repo_processing",
"repo_count": len(repo_full_names_to_index),
"start_date": start_date,
"end_date": end_date,
},
)
logger.info(
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
)
if start_date and end_date:
logger.info(
f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
)
# 5. Iterate through selected repositories and index files
for repo_full_name in repo_full_names_to_index:
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
logger.info(f"Processing repository: {repo_full_name}")
try:
files_to_index = github_client.get_repository_files(repo_full_name)
if not files_to_index:
logger.info(
f"No indexable files found in repository: {repo_full_name}"
)
continue
logger.info(
f"Found {len(files_to_index)} files to process in {repo_full_name}"
)
for file_info in files_to_index:
file_path = file_info.get("path")
file_url = file_info.get("url")
file_sha = file_info.get("sha")
file_type = file_info.get("type") # 'code' or 'doc'
full_path_key = f"{repo_full_name}/{file_path}"
if not file_path or not file_url or not file_sha:
logger.warning(
f"Skipping file with missing info in {repo_full_name}: {file_info}"
)
continue
# Get file content
file_content = github_client.get_file_content(
repo_full_name, file_path
)
if file_content is None:
logger.warning(
f"Could not retrieve content for {full_path_key}. Skipping."
)
continue # Skip if content fetch failed
content_hash = generate_content_hash(file_content, search_space_id)
# Check if document with this content hash already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
)
continue
# The full file content is chunked below; the stored document content is a
# short preview (first 1000 characters) rather than an LLM-generated summary.
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple preview
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Chunk the content
try:
# Prefer the code-aware chunker when configured; otherwise fall back
# to the regular document chunker.
if hasattr(config, "code_chunker_instance"):
chunks_data = [
{
"content": chunk.text,
"embedding": config.embedding_model_instance.embed(
chunk.text
),
}
for chunk in config.code_chunker_instance.chunk(
file_content
)
]
else:
chunks_data = await create_document_chunks(file_content)
except Exception as chunk_err:
logger.error(
f"Failed to chunk file {full_path_key}: {chunk_err}"
)
errors.append(
f"Chunking failed for {full_path_key}: {chunk_err}"
)
continue # Skip this file if chunking fails
doc_metadata = {
"repository_full_name": repo_full_name,
"file_path": file_path,
"full_path": full_path_key, # For easier lookup
"url": file_url,
"sha": file_sha,
"type": file_type,
"indexed_at": datetime.now(UTC).isoformat(),
}
# Create new document
logger.info(f"Creating new document for file: {full_path_key}")
document = Document(
title=f"GitHub - {file_path}",
document_type=DocumentType.GITHUB_CONNECTOR,
document_metadata=doc_metadata,
content=summary_content, # Store summary
content_hash=content_hash,
embedding=summary_embedding,
search_space_id=search_space_id,
chunks=chunks_data, # Associate chunks directly
)
session.add(document)
documents_processed += 1
except Exception as repo_err:
logger.error(
f"Failed to process repository {repo_full_name}: {repo_err}"
)
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
# Commit all changes at the end
await session.commit()
logger.info(
f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed GitHub indexing for connector {connector_id}",
{
"documents_processed": documents_processed,
"errors_count": len(errors),
"repo_count": len(repo_full_names_to_index),
},
)
except SQLAlchemyError as db_err:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during GitHub indexing for connector {connector_id}",
str(db_err),
{"error_type": "SQLAlchemyError"},
)
logger.error(
f"Database error during GitHub indexing for connector {connector_id}: {db_err}"
)
errors.append(f"Database error: {db_err}")
return documents_processed, "; ".join(errors) if errors else str(db_err)
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Unexpected error during GitHub indexing for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(
f"Unexpected error during GitHub indexing for connector {connector_id}: {e}",
exc_info=True,
)
errors.append(f"Unexpected error: {e}")
return documents_processed, "; ".join(errors) if errors else str(e)
error_message = "; ".join(errors) if errors else None
return documents_processed, error_message

View file

@ -0,0 +1,350 @@
"""
Google Calendar connector indexer.
"""
from datetime import datetime, timedelta
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.google_calendar_connector import GoogleCalendarConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_google_calendar_events(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Google Calendar events.
Args:
session: Database session
connector_id: ID of the Google Calendar connector
search_space_id: ID of the search space to store documents in
user_id: User ID
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="google_calendar_events_indexing",
source="connector_indexing_task",
message=f"Starting Google Calendar events indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return 0, f"Connector with ID {connector_id} not found"
# Get the Google Calendar credentials from the connector config
credentials = Credentials(
token=connector.config.get("token"),
refresh_token=connector.config.get("refresh_token"),
token_uri=connector.config.get("token_uri"),
client_id=connector.config.get("client_id"),
client_secret=connector.config.get("client_secret"),
scopes=connector.config.get("scopes"),
)
if (
not credentials.client_id
or not credentials.client_secret
or not credentials.refresh_token
):
await task_logger.log_task_failure(
log_entry,
f"Google Calendar credentials not found in connector config for connector {connector_id}",
"Missing Google Calendar credentials",
{"error_type": "MissingCredentials"},
)
return 0, "Google Calendar credentials not found in connector config"
# Initialize Google Calendar client
await task_logger.log_task_progress(
log_entry,
f"Initializing Google Calendar client for connector {connector_id}",
{"stage": "client_initialization"},
)
calendar_client = GoogleCalendarConnector(credentials=credentials)
# Calculate date range
if start_date is None or end_date is None:
# Fall back to calculating dates based on last_indexed_at
calculated_end_date = datetime.now()
# Use last_indexed_at as start date if available, otherwise use 30 days ago
if connector.last_indexed_at:
# Convert dates to be comparable (both timezone-naive)
last_indexed_naive = (
connector.last_indexed_at.replace(tzinfo=None)
if connector.last_indexed_at.tzinfo
else connector.last_indexed_at
)
# Check if last_indexed_at is in the future or after end_date
if last_indexed_naive > calculated_end_date:
logger.warning(
f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead."
)
calculated_start_date = calculated_end_date - timedelta(days=30)
else:
calculated_start_date = last_indexed_naive
logger.info(
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
)
else:
calculated_start_date = calculated_end_date - timedelta(
days=30
) # Use 30 days as default for calendar events
logger.info(
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (30 days ago) as start date"
)
# Use calculated dates if not provided
start_date_str = (
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
)
end_date_str = (
end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
)
else:
# Use provided dates
start_date_str = start_date
end_date_str = end_date
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
{
"stage": "fetching_events",
"start_date": start_date_str,
"end_date": end_date_str,
},
)
# Get events within date range from primary calendar
try:
events, error = calendar_client.get_all_primary_calendar_events(
start_date=start_date_str, end_date=end_date_str
)
if error:
logger.error(f"Failed to get Google Calendar events: {error}")
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error:
logger.info(
"No events found is not a critical error, continuing with update"
)
if update_last_indexed:
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found"
)
await task_logger.log_task_success(
log_entry,
f"No Google Calendar events found in date range {start_date_str} to {end_date_str}",
{"events_found": 0},
)
return 0, None
else:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Google Calendar events: {error}",
"API Error",
{"error_type": "APIError"},
)
return 0, f"Failed to get Google Calendar events: {error}"
logger.info(f"Retrieved {len(events)} events from Google Calendar API")
except Exception as e:
logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
return 0, f"Error fetching Google Calendar events: {e!s}"
documents_indexed = 0
documents_skipped = 0
skipped_events = []
for event in events:
try:
event_id = event.get("id")
event_summary = event.get("summary", "No Title")
calendar_id = event.get("calendarId", "")
if not event_id:
logger.warning(f"Skipping event with missing ID: {event_summary}")
skipped_events.append(f"{event_summary} (missing ID)")
documents_skipped += 1
continue
event_markdown = calendar_client.format_event_to_markdown(event)
if not event_markdown.strip():
logger.warning(f"Skipping event with no content: {event_summary}")
skipped_events.append(f"{event_summary} (no content)")
documents_skipped += 1
continue
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
location = event.get("location", "")
description = event.get("description", "")
summary_content = f"Google Calendar Event: {event_summary}\n\n"
summary_content += f"Calendar: {calendar_id}\n"
summary_content += f"Start: {start_time}\n"
summary_content += f"End: {end_time}\n"
if location:
summary_content += f"Location: {location}\n"
if description:
desc_preview = description[:300]
if len(description) > 300:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
content_hash = generate_content_hash(event_markdown, search_space_id)
# Check if document with this content hash already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing."
)
documents_skipped += 1
continue
# Embeddings and chunks
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(event_markdown)
document = Document(
search_space_id=search_space_id,
title=f"Calendar Event - {event_summary}",
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
document_metadata={
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new event {event_summary}")
except Exception as e:
logger.error(
f"Error processing event {event.get('summary', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_events.append(
f"{event.get('summary', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await task_logger.log_task_success(
log_entry,
f"Successfully completed Google Calendar indexing for connector {connector_id}",
{
"events_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_events_count": len(skipped_events),
},
)
logger.info(
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
)
return total_processed, None
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Google Calendar indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Google Calendar events for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Google Calendar events: {e!s}", exc_info=True)
return 0, f"Failed to index Google Calendar events: {e!s}"

View file

@ -0,0 +1,320 @@
"""
Jira connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.jira_connector import JiraConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
calculate_date_range,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_jira_issues(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Jira issues and comments.
Args:
session: Database session
connector_id: ID of the Jira connector
search_space_id: ID of the search space to store documents in
user_id: User ID
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="jira_issues_indexing",
source="connector_indexing_task",
message=f"Starting Jira issues indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.JIRA_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return 0, f"Connector with ID {connector_id} not found"
# Get the Jira credentials from the connector config
jira_email = connector.config.get("JIRA_EMAIL")
jira_api_token = connector.config.get("JIRA_API_TOKEN")
jira_base_url = connector.config.get("JIRA_BASE_URL")
if not jira_email or not jira_api_token or not jira_base_url:
await task_logger.log_task_failure(
log_entry,
f"Jira credentials not found in connector config for connector {connector_id}",
"Missing Jira credentials",
{"error_type": "MissingCredentials"},
)
return 0, "Jira credentials not found in connector config"
# Initialize Jira client
await task_logger.log_task_progress(
log_entry,
f"Initializing Jira client for connector {connector_id}",
{"stage": "client_initialization"},
)
jira_client = JiraConnector(
base_url=jira_base_url, email=jira_email, api_token=jira_api_token
)
# Calculate date range
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
await task_logger.log_task_progress(
log_entry,
f"Fetching Jira issues from {start_date_str} to {end_date_str}",
{
"stage": "fetching_issues",
"start_date": start_date_str,
"end_date": end_date_str,
},
)
# Get issues within date range
try:
issues, error = jira_client.get_issues_by_date_range(
start_date=start_date_str, end_date=end_date_str, include_comments=True
)
if error:
logger.error(f"Failed to get Jira issues: {error}")
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
logger.info(
"No issues found is not a critical error, continuing with update"
)
if update_last_indexed:
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
)
await task_logger.log_task_success(
log_entry,
f"No Jira issues found in date range {start_date_str} to {end_date_str}",
{"issues_found": 0},
)
return 0, None
else:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Jira issues: {error}",
"API Error",
{"error_type": "APIError"},
)
return 0, f"Failed to get Jira issues: {error}"
logger.info(f"Retrieved {len(issues)} issues from Jira API")
except Exception as e:
logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True)
return 0, f"Error fetching Jira issues: {e!s}"
# Process and index each issue
documents_indexed = 0
skipped_issues = []
documents_skipped = 0
for issue in issues:
try:
issue_id = issue.get("key")
issue_identifier = issue.get("key", "")
issue_title = issue.get("id", "")
if not issue_id or not issue_title:
logger.warning(
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
)
skipped_issues.append(
f"{issue_identifier or 'Unknown'} (missing data)"
)
documents_skipped += 1
continue
# Format the issue for better readability
formatted_issue = jira_client.format_issue(issue)
# Convert to markdown
issue_content = jira_client.format_issue_to_markdown(formatted_issue)
if not issue_content:
logger.warning(
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
)
skipped_issues.append(f"{issue_identifier} (no content)")
documents_skipped += 1
continue
# Create a simple summary
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
if formatted_issue.get("description"):
summary_content += (
f"Description: {formatted_issue.get('description')}\n\n"
)
# Add comment count
comment_count = len(formatted_issue.get("comments", []))
summary_content += f"Comments: {comment_count}"
# Generate content hash
content_hash = generate_content_hash(issue_content, search_space_id)
# Check if document already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
)
documents_skipped += 1
continue
# Generate embedding for the summary
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full issue content with comments
chunks = await create_document_chunks(issue_content)
# Create and store new document
logger.info(
f"Creating new document for issue {issue_identifier} - {issue_title}"
)
document = Document(
search_space_id=search_space_id,
title=f"Jira - {issue_identifier}: {issue_title}",
document_type=DocumentType.JIRA_CONNECTOR,
document_metadata={
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"state": formatted_issue.get("status", "Unknown"),
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
)
except Exception as e:
logger.error(
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_issues.append(
f"{issue.get('identifier', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this issue and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
logger.info("Successfully committed all JIRA document changes to database")
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed JIRA indexing for connector {connector_id}",
{
"issues_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_issues_count": len(skipped_issues),
},
)
logger.info(
f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during JIRA indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index JIRA issues for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index JIRA issues: {e!s}", exc_info=True)
return 0, f"Failed to index JIRA issues: {e!s}"

View file

@ -0,0 +1,337 @@
"""
Linear connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.linear_connector import LinearConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
calculate_date_range,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_linear_issues(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Linear issues and comments.
Args:
session: Database session
connector_id: ID of the Linear connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="linear_issues_indexing",
source="connector_indexing_task",
message=f"Starting Linear issues indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving Linear connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.LINEAR_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a Linear connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a Linear connector",
)
# Get the Linear token from the connector config
linear_token = connector.config.get("LINEAR_API_KEY")
if not linear_token:
await task_logger.log_task_failure(
log_entry,
f"Linear API token not found in connector config for connector {connector_id}",
"Missing Linear token",
{"error_type": "MissingToken"},
)
return 0, "Linear API token not found in connector config"
# Initialize Linear client
await task_logger.log_task_progress(
log_entry,
f"Initializing Linear client for connector {connector_id}",
{"stage": "client_initialization"},
)
linear_client = LinearConnector(token=linear_token)
# Calculate date range
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}")
await task_logger.log_task_progress(
log_entry,
f"Fetching Linear issues from {start_date_str} to {end_date_str}",
{
"stage": "fetch_issues",
"start_date": start_date_str,
"end_date": end_date_str,
},
)
# Get issues within date range
try:
issues, error = linear_client.get_issues_by_date_range(
start_date=start_date_str, end_date=end_date_str, include_comments=True
)
if error:
logger.error(f"Failed to get Linear issues: {error}")
# Don't treat "No issues found" as an error that should stop indexing
if "No issues found" in error:
logger.info(
"No issues found is not a critical error, continuing with update"
)
if update_last_indexed:
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
)
return 0, None
else:
return 0, f"Failed to get Linear issues: {error}"
logger.info(f"Retrieved {len(issues)} issues from Linear API")
except Exception as e:
logger.error(f"Exception when calling Linear API: {e!s}", exc_info=True)
return 0, f"Failed to get Linear issues: {e!s}"
if not issues:
logger.info("No Linear issues found for the specified date range")
if update_last_indexed:
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
)
return 0, None # Return None instead of error message when no issues found
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
skipped_issues = []
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(issues)} Linear issues",
{"stage": "process_issues", "total_issues": len(issues)},
)
# Process each issue
for issue in issues:
try:
issue_id = issue.get("id", "")
issue_identifier = issue.get("identifier", "")
issue_title = issue.get("title", "")
if not issue_id or not issue_title:
logger.warning(
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
)
skipped_issues.append(
f"{issue_identifier or 'Unknown'} (missing data)"
)
documents_skipped += 1
continue
# Format the issue first to get well-structured data
formatted_issue = linear_client.format_issue(issue)
# Convert issue to markdown format
issue_content = linear_client.format_issue_to_markdown(formatted_issue)
if not issue_content:
logger.warning(
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
)
skipped_issues.append(f"{issue_identifier} (no content)")
documents_skipped += 1
continue
# Create a short summary for the embedding
state = formatted_issue.get("state", "Unknown")
description = formatted_issue.get("description", "")
# Truncate description if it's too long for the summary
if description and len(description) > 500:
description = description[:497] + "..."
# Create a simple summary from the issue data
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
if description:
summary_content += f"Description: {description}\n\n"
# Add comment count
comment_count = len(formatted_issue.get("comments", []))
summary_content += f"Comments: {comment_count}"
content_hash = generate_content_hash(issue_content, search_space_id)
# Check if document with this content hash already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
)
documents_skipped += 1
continue
# Generate embedding for the summary
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full issue content with comments
chunks = await create_document_chunks(issue_content)
# Create and store new document
logger.info(
f"Creating new document for issue {issue_identifier} - {issue_title}"
)
document = Document(
search_space_id=search_space_id,
title=f"Linear - {issue_identifier}: {issue_title}",
document_type=DocumentType.LINEAR_CONNECTOR,
document_metadata={
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"state": state,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
)
except Exception as e:
logger.error(
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_issues.append(
f"{issue.get('identifier', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this issue and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
logger.info("Successfully committed all Linear document changes to database")
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Linear indexing for connector {connector_id}",
{
"issues_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_issues_count": len(skipped_issues),
},
)
logger.info(
f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Linear indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Linear issues for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Linear issues: {e!s}", exc_info=True)
return 0, f"Failed to index Linear issues: {e!s}"

View file

@ -0,0 +1,406 @@
"""
Notion connector indexer.
"""
from datetime import datetime, timedelta
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.notion_history import NotionHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
build_document_metadata_string,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_notion_pages(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Notion pages from all accessible pages.
Args:
session: Database session
connector_id: ID of the Notion connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="notion_pages_indexing",
source="connector_indexing_task",
message=f"Starting Notion pages indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving Notion connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.NOTION_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a Notion connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a Notion connector",
)
# Get the Notion token from the connector config
notion_token = connector.config.get("NOTION_INTEGRATION_TOKEN")
if not notion_token:
await task_logger.log_task_failure(
log_entry,
f"Notion integration token not found in connector config for connector {connector_id}",
"Missing Notion token",
{"error_type": "MissingToken"},
)
return 0, "Notion integration token not found in connector config"
# Initialize Notion client
await task_logger.log_task_progress(
log_entry,
f"Initializing Notion client for connector {connector_id}",
{"stage": "client_initialization"},
)
logger.info(f"Initializing Notion client for connector {connector_id}")
notion_client = NotionHistoryConnector(token=notion_token)
# Calculate date range
if start_date is None or end_date is None:
# Fall back to calculating dates
calculated_end_date = datetime.now()
calculated_start_date = calculated_end_date - timedelta(
days=365
) # Check for last 1 year of pages
# Use calculated dates if not provided
if start_date is None:
start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
else:
# Convert YYYY-MM-DD to ISO format
start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
if end_date is None:
end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ")
else:
# Convert YYYY-MM-DD to ISO format
end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
else:
# Convert provided dates to ISO format for Notion API
start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}")
await task_logger.log_task_progress(
log_entry,
f"Fetching Notion pages from {start_date_iso} to {end_date_iso}",
{
"stage": "fetch_pages",
"start_date": start_date_iso,
"end_date": end_date_iso,
},
)
# Get all pages
try:
pages = notion_client.get_all_pages(
start_date=start_date_iso, end_date=end_date_iso
)
logger.info(f"Found {len(pages)} Notion pages")
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages for connector {connector_id}",
str(e),
{"error_type": "PageFetchError"},
)
logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
return 0, f"Failed to get Notion pages: {e!s}"
if not pages:
await task_logger.log_task_success(
log_entry,
f"No Notion pages found for connector {connector_id}",
{"pages_found": 0},
)
logger.info("No Notion pages found to index")
return 0, "No Notion pages found"
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
skipped_pages = []
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(pages)} Notion pages",
{"stage": "process_pages", "total_pages": len(pages)},
)
# Process each page
for page in pages:
try:
page_id = page.get("page_id")
page_title = page.get("title", f"Untitled page ({page_id})")
page_content = page.get("content", [])
logger.info(f"Processing Notion page: {page_title} ({page_id})")
if not page_content:
logger.info(f"No content found in page {page_title}. Skipping.")
skipped_pages.append(f"{page_title} (no content)")
documents_skipped += 1
continue
# Convert page content to markdown format
markdown_content = f"# Notion Page: {page_title}\n\n"
# Process blocks recursively
def process_blocks(blocks, level=0):
result = ""
for block in blocks:
block_type = block.get("type")
block_content = block.get("content", "")
children = block.get("children", [])
# Add indentation based on level
indent = " " * level
# Format based on block type
if block_type in ["paragraph", "text"]:
result += f"{indent}{block_content}\n\n"
elif block_type in ["heading_1", "header"]:
result += f"{indent}# {block_content}\n\n"
elif block_type == "heading_2":
result += f"{indent}## {block_content}\n\n"
elif block_type == "heading_3":
result += f"{indent}### {block_content}\n\n"
elif block_type == "bulleted_list_item":
result += f"{indent}* {block_content}\n"
elif block_type == "numbered_list_item":
result += f"{indent}1. {block_content}\n"
elif block_type == "to_do":
result += f"{indent}- [ ] {block_content}\n"
elif block_type == "toggle":
result += f"{indent}> {block_content}\n"
elif block_type == "code":
result += f"{indent}```\n{block_content}\n```\n\n"
elif block_type == "quote":
result += f"{indent}> {block_content}\n\n"
elif block_type == "callout":
result += f"{indent}> **Note:** {block_content}\n\n"
elif block_type == "image":
result += f"{indent}![Image]({block_content})\n\n"
else:
# Default for other block types
if block_content:
result += f"{indent}{block_content}\n\n"
# Process children recursively
if children:
result += process_blocks(children, level + 1)
return result
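# Illustrative (hypothetical) rendering: a block such as
#     {"type": "heading_2", "content": "Goals",
#      "children": [{"type": "bulleted_list_item", "content": "Ship v1", "children": []}]}
# is emitted by process_blocks as "## Goals\n\n" followed by the indented
# child item "  * Ship v1\n".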
logger.debug(
f"Converting {len(page_content)} blocks to markdown for page {page_title}"
)
markdown_content += process_blocks(page_content)
# Format document metadata
metadata_sections = [
("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]),
(
"CONTENT",
[
"FORMAT: markdown",
"TEXT_START",
markdown_content,
"TEXT_END",
],
),
]
# Build the document string
combined_document_string = build_document_metadata_string(
metadata_sections
)
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
# Check if document with this content hash already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
)
documents_skipped += 1
continue
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
logger.error(f"No long context LLM configured for user {user_id}")
skipped_pages.append(f"{page_title} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary
logger.debug(f"Generating summary for page {page_title}")
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
summary_result = await summary_chain.ainvoke(
{"document": combined_document_string}
)
summary_content = summary_result.content
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
logger.debug(f"Chunking content for page {page_title}")
chunks = await create_document_chunks(markdown_content)
# Create and store new document
document = Document(
search_space_id=search_space_id,
title=f"Notion - {page_title}",
document_type=DocumentType.NOTION_CONNECTOR,
document_metadata={
"page_title": page_title,
"page_id": page_id,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new Notion page: {page_title}")
except Exception as e:
logger.error(
f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_pages.append(
f"{page.get('title', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this page and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
# and if we successfully indexed at least one page
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
# Prepare result message
result_message = None
if skipped_pages:
result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
else:
result_message = f"Processed {total_processed} pages."
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Notion indexing for connector {connector_id}",
{
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_pages_count": len(skipped_pages),
"result_message": result_message,
},
)
logger.info(
f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
)
return total_processed, result_message
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Notion indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(
f"Database error during Notion indexing: {db_error!s}", exc_info=True
)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Notion pages for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Notion pages: {e!s}", exc_info=True)
return 0, f"Failed to index Notion pages: {e!s}"

View file

@ -0,0 +1,396 @@
"""
Slack connector indexer.
"""
from datetime import datetime
from slack_sdk.errors import SlackApiError
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.slack_history import SlackHistory
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
build_document_metadata_string,
calculate_date_range,
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_slack_messages(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index Slack messages from all accessible channels.
Args:
session: Database session
connector_id: ID of the Slack connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="slack_messages_indexing",
source="connector_indexing_task",
message=f"Starting Slack messages indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving Slack connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.SLACK_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a Slack connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a Slack connector",
)
# Get the Slack token from the connector config
slack_token = connector.config.get("SLACK_BOT_TOKEN")
if not slack_token:
await task_logger.log_task_failure(
log_entry,
f"Slack token not found in connector config for connector {connector_id}",
"Missing Slack token",
{"error_type": "MissingToken"},
)
return 0, "Slack token not found in connector config"
# Initialize Slack client
await task_logger.log_task_progress(
log_entry,
f"Initializing Slack client for connector {connector_id}",
{"stage": "client_initialization"},
)
slack_client = SlackHistory(token=slack_token)
# Calculate date range
await task_logger.log_task_progress(
log_entry,
"Calculating date range for Slack indexing",
{
"stage": "date_calculation",
"provided_start_date": start_date,
"provided_end_date": end_date,
},
)
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}")
await task_logger.log_task_progress(
log_entry,
f"Fetching Slack channels from {start_date_str} to {end_date_str}",
{
"stage": "fetch_channels",
"start_date": start_date_str,
"end_date": end_date_str,
},
)
# Get all channels
try:
channels = slack_client.get_all_channels()
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Slack channels for connector {connector_id}",
str(e),
{"error_type": "ChannelFetchError"},
)
return 0, f"Failed to get Slack channels: {e!s}"
if not channels:
await task_logger.log_task_success(
log_entry,
f"No Slack channels found for connector {connector_id}",
{"channels_found": 0},
)
return 0, "No Slack channels found"
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
skipped_channels = []
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(channels)} Slack channels",
{"stage": "process_channels", "total_channels": len(channels)},
)
# Process each channel
for channel_obj in channels:
channel_id = channel_obj["id"]
channel_name = channel_obj["name"]
is_private = channel_obj["is_private"]
is_member = channel_obj["is_member"]
try:
# If it's a private channel and the bot is not a member, skip.
if is_private and not is_member:
logger.warning(
f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping."
)
skipped_channels.append(
f"{channel_name} (private, bot not a member)"
)
documents_skipped += 1
continue
# Get messages for this channel
messages, error = slack_client.get_history_by_date_range(
channel_id=channel_id,
start_date=start_date_str,
end_date=end_date_str,
limit=1000, # Limit to 1000 messages per channel
)
if error:
logger.warning(
f"Error getting messages from channel {channel_name}: {error}"
)
skipped_channels.append(f"{channel_name} (error: {error})")
documents_skipped += 1
continue # Skip this channel if there's an error
if not messages:
logger.info(
f"No messages found in channel {channel_name} for the specified date range."
)
documents_skipped += 1
continue # Skip if no messages
# Format messages with user info
formatted_messages = []
for msg in messages:
# Skip bot messages and system messages
if msg.get("subtype") in [
"bot_message",
"channel_join",
"channel_leave",
]:
continue
formatted_msg = slack_client.format_message(
msg, include_user_info=True
)
formatted_messages.append(formatted_msg)
if not formatted_messages:
logger.info(
f"No valid messages found in channel {channel_name} after filtering."
)
documents_skipped += 1
continue # Skip if no valid messages after filtering
# Convert messages to markdown format
channel_content = f"# Slack Channel: {channel_name}\n\n"
for msg in formatted_messages:
user_name = msg.get("user_name", "Unknown User")
timestamp = msg.get("datetime", "Unknown Time")
text = msg.get("text", "")
channel_content += (
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
)
# Format document metadata
metadata_sections = [
(
"METADATA",
[
f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}",
],
),
(
"CONTENT",
["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"],
),
]
# Build the document string
combined_document_string = build_document_metadata_string(
metadata_sections
)
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
# Check if document with this content hash already exists
existing_document_by_hash = await check_duplicate_document_by_hash(
session, content_hash
)
if existing_document_by_hash:
logger.info(
f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
)
documents_skipped += 1
continue
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
logger.error(f"No long context LLM configured for user {user_id}")
skipped_channels.append(f"{channel_name} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
summary_result = await summary_chain.ainvoke(
{"document": combined_document_string}
)
summary_content = summary_result.content
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(channel_content)
# Create and store new document
document = Document(
search_space_id=search_space_id,
title=f"Slack - {channel_name}",
document_type=DocumentType.SLACK_CONNECTOR,
document_metadata={
"channel_name": channel_name,
"channel_id": channel_id,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(formatted_messages),
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
)
except SlackApiError as slack_error:
logger.error(
f"Slack API error for channel {channel_name}: {slack_error!s}"
)
skipped_channels.append(f"{channel_name} (Slack API error)")
documents_skipped += 1
continue # Skip this channel and continue with others
except Exception as e:
logger.error(f"Error processing channel {channel_name}: {e!s}")
skipped_channels.append(f"{channel_name} (processing error)")
documents_skipped += 1
continue # Skip this channel and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
# and if we successfully indexed at least one channel
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
# Prepare result message
result_message = None
if skipped_channels:
result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
else:
result_message = f"Processed {total_processed} channels."
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Slack indexing for connector {connector_id}",
{
"channels_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_channels_count": len(skipped_channels),
"result_message": result_message,
},
)
logger.info(
f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
)
return total_processed, result_message
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during Slack indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}")
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index Slack messages for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Slack messages: {e!s}")
return 0, f"Failed to index Slack messages: {e!s}"

File diff suppressed because it is too large

View file

@@ -0,0 +1,47 @@
"""
Document processors module for background tasks.
This module provides a collection of document processors for different content types
and sources. Each processor is responsible for handling a specific type of document
processing task in the background.
Available processors:
- URL crawler: Process web pages from URLs
- Extension processor: Handle documents from browser extension
- Markdown processor: Process markdown files
- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
- YouTube processor: Process YouTube videos and extract transcripts
"""
# Extension processor
from .extension_processor import add_extension_received_document
# File processors
from .file_processors import (
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
)
# Markdown processor
from .markdown_processor import add_received_markdown_file_document
# URL crawler
from .url_crawler import add_crawled_url_document
# YouTube processor
from .youtube_processor import add_youtube_video_document
__all__ = [
# URL processing
"add_crawled_url_document",
# Extension processing
"add_extension_received_document",
# File processing with different ETL services
"add_received_file_document_using_docling",
"add_received_file_document_using_llamacloud",
"add_received_file_document_using_unstructured",
# Markdown file processing
"add_received_markdown_file_document",
# YouTube video processing
"add_youtube_video_document",
]
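A short sketch of consuming these re-exports; the dotted package path is an assumption (file paths are suppressed in this diff) and the URLs are placeholders.

# Sketch: call two of the re-exported processors through the package namespace.
from sqlalchemy.ext.asyncio import AsyncSession

from app.tasks.background_tasks.document_processors import (  # assumed dotted path
    add_crawled_url_document,
    add_youtube_video_document,
)


async def ingest_examples(session: AsyncSession, search_space_id: int, user_id: str) -> None:
    # Both processors take (session, url, search_space_id, user_id) and return a Document.
    await add_crawled_url_document(session, "https://example.com", search_space_id, user_id)
    await add_youtube_video_document(
        session, "https://www.youtube.com/watch?v=dQw4w9WgXcQ", search_space_id, user_id
    )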

View file

@@ -0,0 +1,75 @@
"""
Base functionality and shared imports for document processors.
"""
from langchain_community.document_transformers import MarkdownifyTransformer
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config
from app.db import Chunk, Document
from app.prompts import SUMMARY_PROMPT_TEMPLATE
# Initialize markdown transformer
md = MarkdownifyTransformer()
async def check_duplicate_document(
session: AsyncSession, content_hash: str
) -> Document | None:
"""
Check if a document with the given content hash already exists.
Args:
session: Database session
content_hash: Hash of the document content
Returns:
Existing document if found, None otherwise
"""
existing_doc_result = await session.execute(
select(Document).where(Document.content_hash == content_hash)
)
return existing_doc_result.scalars().first()
async def create_document_chunks(content: str) -> list[Chunk]:
"""
Create chunks from document content.
Args:
content: Document content to chunk
Returns:
List of Chunk objects with embeddings
"""
return [
Chunk(
content=chunk.text,
embedding=config.embedding_model_instance.embed(chunk.text),
)
for chunk in config.chunker_instance.chunk(content)
]
async def generate_document_summary(
content: str, user_llm, document_title: str = ""
) -> tuple[str, list[float]]:
"""
Generate summary and embedding for document content.
Args:
content: Document content
user_llm: User's LLM instance
document_title: Optional document title for context
Returns:
Tuple of (summary_content, summary_embedding)
"""
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
summary_result = await summary_chain.ainvoke({"document": content})
summary_content = summary_result.content
summary_embedding = config.embedding_model_instance.embed(summary_content)
return summary_content, summary_embedding
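A sketch of the dedupe → summarize → chunk pipeline that each processor below builds from these helpers; the import path for the base module is an assumption.

# Pipeline sketch using only the helpers defined in this base module.
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, DocumentType
from app.utils.document_converters import generate_content_hash
from app.tasks.background_tasks.document_processors.base import (  # assumed dotted path
    check_duplicate_document,
    create_document_chunks,
    generate_document_summary,
)


async def store_markdown(
    session: AsyncSession, markdown: str, title: str, search_space_id: int, user_llm
) -> Document:
    content_hash = generate_content_hash(markdown, search_space_id)
    # 1. Skip work if an identical document is already stored.
    existing = await check_duplicate_document(session, content_hash)
    if existing:
        return existing
    # 2. Summarize with the user's long-context LLM and embed the summary.
    summary, embedding = await generate_document_summary(markdown, user_llm, title)
    # 3. Chunk and embed the raw content.
    chunks = await create_document_chunks(markdown)
    document = Document(
        search_space_id=search_space_id,
        title=title,
        document_type=DocumentType.FILE,
        document_metadata={"FILE_NAME": title},
        content=summary,
        embedding=embedding,
        chunks=chunks,
        content_hash=content_hash,
    )
    session.add(document)
    await session.commit()
    await session.refresh(document)
    return document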

View file

@@ -0,0 +1,163 @@
"""
Extension document processor for SurfSense browser extension.
"""
import logging
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentType
from app.schemas import ExtensionDocumentContent
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document,
create_document_chunks,
generate_document_summary,
)
async def add_extension_received_document(
session: AsyncSession,
content: ExtensionDocumentContent,
search_space_id: int,
user_id: str,
) -> Document | None:
"""
Process and store document content received from the SurfSense Extension.
Args:
session: Database session
content: Document content from extension
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="extension_document",
source="background_task",
message=f"Processing extension document: {content.metadata.VisitedWebPageTitle}",
metadata={
"url": content.metadata.VisitedWebPageURL,
"title": content.metadata.VisitedWebPageTitle,
"user_id": str(user_id),
},
)
try:
# Format document metadata in a more maintainable way
metadata_sections = [
(
"METADATA",
[
f"SESSION_ID: {content.metadata.BrowsingSessionId}",
f"URL: {content.metadata.VisitedWebPageURL}",
f"TITLE: {content.metadata.VisitedWebPageTitle}",
f"REFERRER: {content.metadata.VisitedWebPageReffererURL}",
f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}",
f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}",
],
),
(
"CONTENT",
["FORMAT: markdown", "TEXT_START", content.pageContent, "TEXT_END"],
),
]
# Build the document string more efficiently
document_parts = []
document_parts.append("<DOCUMENT>")
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check if document with this content hash already exists
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
},
)
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm
)
# Process chunks
chunks = await create_document_chunks(content.pageContent)
# Create and store document
document = Document(
search_space_id=search_space_id,
title=content.metadata.VisitedWebPageTitle,
document_type=DocumentType.EXTENSION,
document_metadata=content.metadata.model_dump(),
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {content.metadata.VisitedWebPageTitle}",
{
"document_id": document.id,
"content_hash": content_hash,
"url": content.metadata.VisitedWebPageURL,
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error processing extension document: {content.metadata.VisitedWebPageTitle}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {content.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__},
)
raise RuntimeError(f"Failed to process extension document: {e!s}") from e

View file

@@ -0,0 +1,261 @@
"""
File document processors for different ETL services (Unstructured, LlamaCloud, Docling).
"""
import logging
from langchain_core.documents import Document as LangChainDocument
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.utils.document_converters import (
convert_document_to_markdown,
generate_content_hash,
)
from .base import (
check_duplicate_document,
create_document_chunks,
generate_document_summary,
)
async def add_received_file_document_using_unstructured(
session: AsyncSession,
file_name: str,
unstructured_processed_elements: list[LangChainDocument],
search_space_id: int,
user_id: str,
) -> Document | None:
"""
Process and store a file document using Unstructured service.
Args:
session: Database session
file_name: Name of the processed file
unstructured_processed_elements: Processed elements from Unstructured
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
try:
file_in_markdown = await convert_document_to_markdown(
unstructured_processed_elements
)
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this content hash already exists
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# TODO: Check if file_in_markdown exceeds the embedding model's token limit
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm
)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
# Create and store document
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file_name,
"ETL_SERVICE": "UNSTRUCTURED",
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
return document
except SQLAlchemyError as db_error:
await session.rollback()
raise db_error
except Exception as e:
await session.rollback()
raise RuntimeError(f"Failed to process file document: {e!s}") from e
async def add_received_file_document_using_llamacloud(
session: AsyncSession,
file_name: str,
llamacloud_markdown_document: str,
search_space_id: int,
user_id: str,
) -> Document | None:
"""
Process and store document content parsed by LlamaCloud.
Args:
session: Database session
file_name: Name of the processed file
llamacloud_markdown_document: Markdown content from LlamaCloud parsing
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
try:
# LlamaCloud already delivers the parsed file as a single markdown string
file_in_markdown = llamacloud_markdown_document
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this content hash already exists
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm
)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
# Create and store document
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file_name,
"ETL_SERVICE": "LLAMACLOUD",
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
return document
except SQLAlchemyError as db_error:
await session.rollback()
raise db_error
except Exception as e:
await session.rollback()
raise RuntimeError(
f"Failed to process file document using LlamaCloud: {e!s}"
) from e
async def add_received_file_document_using_docling(
session: AsyncSession,
file_name: str,
docling_markdown_document: str,
search_space_id: int,
user_id: str,
) -> Document | None:
"""
Process and store document content parsed by Docling.
Args:
session: Database session
file_name: Name of the processed file
docling_markdown_document: Markdown content from Docling parsing
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
try:
file_in_markdown = docling_markdown_document
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this content hash already exists
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary using chunked processing for large documents
from app.services.docling_service import create_docling_service
docling_service = create_docling_service()
summary_content = await docling_service.process_large_document_summary(
content=file_in_markdown, llm=user_llm, document_title=file_name
)
from app.config import config
summary_embedding = config.embedding_model_instance.embed(summary_content)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
# Create and store document
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file_name,
"ETL_SERVICE": "DOCLING",
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
return document
except SQLAlchemyError as db_error:
await session.rollback()
raise db_error
except Exception as e:
await session.rollback()
raise RuntimeError(
f"Failed to process file document using Docling: {e!s}"
) from e
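A hypothetical dispatcher showing how a caller might route a parsed file to the matching processor based on the configured ETL service; neither the dispatcher nor the import path appears in this diff.

# Hypothetical dispatcher keyed on the ETL_SERVICE values used in the metadata above.
from app.tasks.background_tasks.document_processors.file_processors import (  # assumed dotted path
    add_received_file_document_using_docling,
    add_received_file_document_using_llamacloud,
    add_received_file_document_using_unstructured,
)


async def store_parsed_file(
    session, etl_service: str, file_name: str, parsed, search_space_id: int, user_id: str
):
    if etl_service == "UNSTRUCTURED":
        # `parsed` is the list of LangChain Documents produced by Unstructured.
        return await add_received_file_document_using_unstructured(
            session, file_name, parsed, search_space_id, user_id
        )
    if etl_service == "LLAMACLOUD":
        # `parsed` is a single markdown string.
        return await add_received_file_document_using_llamacloud(
            session, file_name, parsed, search_space_id, user_id
        )
    if etl_service == "DOCLING":
        # `parsed` is a single markdown string produced by Docling.
        return await add_received_file_document_using_docling(
            session, file_name, parsed, search_space_id, user_id
        )
    raise ValueError(f"Unknown ETL service: {etl_service}")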

View file

@@ -0,0 +1,136 @@
"""
Markdown file document processor.
"""
import logging
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document,
create_document_chunks,
generate_document_summary,
)
async def add_received_markdown_file_document(
session: AsyncSession,
file_name: str,
file_in_markdown: str,
search_space_id: int,
user_id: str,
) -> Document | None:
"""
Process and store a markdown file document.
Args:
session: Database session
file_name: Name of the markdown file
file_in_markdown: Content of the markdown file
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="markdown_file_document",
source="background_task",
message=f"Processing markdown file: {file_name}",
metadata={
"filename": file_name,
"user_id": str(user_id),
"content_length": len(file_in_markdown),
},
)
try:
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this content hash already exists
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
await task_logger.log_task_success(
log_entry,
f"Markdown file document already exists: {file_name}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
},
)
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm
)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
# Create and store document
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file_name,
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully processed markdown file: {file_name}",
{
"document_id": document.id,
"content_hash": content_hash,
"chunks_count": len(chunks),
"summary_length": len(summary_content),
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error processing markdown file: {file_name}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to process markdown file: {file_name}",
str(e),
{"error_type": type(e).__name__},
)
raise RuntimeError(f"Failed to process file document: {e!s}") from e

View file

@@ -0,0 +1,242 @@
"""
URL crawler document processor.
"""
import logging
import validators
from langchain_community.document_loaders import AsyncChromiumLoader, FireCrawlLoader
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document,
create_document_chunks,
generate_document_summary,
md,
)
async def add_crawled_url_document(
session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document | None:
"""
Process and store a document from a crawled URL.
Args:
session: Database session
url: URL to crawl
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="crawl_url_document",
source="background_task",
message=f"Starting URL crawling process for: {url}",
metadata={"url": url, "user_id": str(user_id)},
)
try:
# URL validation step
await task_logger.log_task_progress(
log_entry, f"Validating URL: {url}", {"stage": "validation"}
)
if not validators.url(url):
raise ValueError(f"URL {url} is not a valid URL")
# Set up crawler
await task_logger.log_task_progress(
log_entry,
f"Setting up crawler for URL: {url}",
{
"stage": "crawler_setup",
"firecrawl_available": bool(config.FIRECRAWL_API_KEY),
},
)
if config.FIRECRAWL_API_KEY:
crawl_loader = FireCrawlLoader(
url=url,
api_key=config.FIRECRAWL_API_KEY,
mode="scrape",
params={
"formats": ["markdown"],
"excludeTags": ["a"],
},
)
else:
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
# Perform crawling
await task_logger.log_task_progress(
log_entry,
f"Crawling URL content: {url}",
{"stage": "crawling", "crawler_type": type(crawl_loader).__name__},
)
url_crawled = await crawl_loader.aload()
if isinstance(crawl_loader, FireCrawlLoader):
content_in_markdown = url_crawled[0].page_content
elif isinstance(crawl_loader, AsyncChromiumLoader):
content_in_markdown = md.transform_documents(url_crawled)[0].page_content
# Format document
await task_logger.log_task_progress(
log_entry,
f"Processing crawled content from: {url}",
{"stage": "content_processing", "content_length": len(content_in_markdown)},
)
# Format document metadata in a more maintainable way
metadata_sections = [
(
"METADATA",
[
f"{key.upper()}: {value}"
for key, value in url_crawled[0].metadata.items()
],
),
(
"CONTENT",
["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"],
),
]
# Build the document string more efficiently
document_parts = []
document_parts.append("<DOCUMENT>")
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check for duplicates
await task_logger.log_task_progress(
log_entry,
f"Checking for duplicate content: {url}",
{"stage": "duplicate_check", "content_hash": content_hash},
)
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
await task_logger.log_task_success(
log_entry,
f"Document already exists for URL: {url}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
},
)
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get LLM for summary generation
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {url}",
{"stage": "llm_setup"},
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
await task_logger.log_task_progress(
log_entry,
f"Generating summary for URL content: {url}",
{"stage": "summary_generation"},
)
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm
)
# Process chunks
await task_logger.log_task_progress(
log_entry,
f"Processing content chunks for URL: {url}",
{"stage": "chunk_processing"},
)
chunks = await create_document_chunks(content_in_markdown)
# Create and store document
await task_logger.log_task_progress(
log_entry,
f"Creating document in database for URL: {url}",
{"stage": "document_creation", "chunks_count": len(chunks)},
)
document = Document(
search_space_id=search_space_id,
title=url_crawled[0].metadata["title"]
if isinstance(crawl_loader, FireCrawlLoader)
else url_crawled[0].metadata["source"],
document_type=DocumentType.CRAWLED_URL,
document_metadata=url_crawled[0].metadata,
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": document.id,
"title": document.title,
"content_hash": content_hash,
"chunks_count": len(chunks),
"summary_length": len(summary_content),
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error while processing URL: {url}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
raise RuntimeError(f"Failed to crawl URL: {e!s}") from e

View file

@@ -0,0 +1,326 @@
"""
YouTube video document processor.
"""
import logging
from urllib.parse import parse_qs, urlparse
import aiohttp
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document,
create_document_chunks,
generate_document_summary,
)
def get_youtube_video_id(url: str) -> str | None:
"""
Extract video ID from various YouTube URL formats.
Args:
url: YouTube URL
Returns:
Video ID if found, None otherwise
"""
parsed_url = urlparse(url)
hostname = parsed_url.hostname
if hostname == "youtu.be":
return parsed_url.path[1:]
if hostname in ("www.youtube.com", "youtube.com"):
if parsed_url.path == "/watch":
query_params = parse_qs(parsed_url.query)
return query_params.get("v", [None])[0]
if parsed_url.path.startswith("/embed/"):
return parsed_url.path.split("/")[2]
if parsed_url.path.startswith("/v/"):
return parsed_url.path.split("/")[2]
return None
async def add_youtube_video_document(
session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document:
"""
Process a YouTube video URL, extract transcripts, and store as a document.
Args:
session: Database session for storing the document
url: YouTube video URL (supports standard, shortened, and embed formats)
search_space_id: ID of the search space to add the document to
user_id: ID of the user
Returns:
Document: The created document object
Raises:
ValueError: If the YouTube video ID cannot be extracted from the URL
SQLAlchemyError: If there's a database error
RuntimeError: If the video processing fails
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="youtube_video_document",
source="background_task",
message=f"Starting YouTube video processing for: {url}",
metadata={"url": url, "user_id": str(user_id)},
)
try:
# Extract video ID from URL
await task_logger.log_task_progress(
log_entry,
f"Extracting video ID from URL: {url}",
{"stage": "video_id_extraction"},
)
# Get video ID
video_id = get_youtube_video_id(url)
if not video_id:
raise ValueError(f"Could not extract video ID from URL: {url}")
await task_logger.log_task_progress(
log_entry,
f"Video ID extracted: {video_id}",
{"stage": "video_id_extracted", "video_id": video_id},
)
# Get video metadata
await task_logger.log_task_progress(
log_entry,
f"Fetching video metadata for: {video_id}",
{"stage": "metadata_fetch"},
)
params = {
"format": "json",
"url": f"https://www.youtube.com/watch?v={video_id}",
}
oembed_url = "https://www.youtube.com/oembed"
async with (
aiohttp.ClientSession() as http_session,
http_session.get(oembed_url, params=params) as response,
):
video_data = await response.json()
await task_logger.log_task_progress(
log_entry,
f"Video metadata fetched: {video_data.get('title', 'Unknown')}",
{
"stage": "metadata_fetched",
"title": video_data.get("title"),
"author": video_data.get("author_name"),
},
)
# Get video transcript
await task_logger.log_task_progress(
log_entry,
f"Fetching transcript for video: {video_id}",
{"stage": "transcript_fetch"},
)
try:
captions = YouTubeTranscriptApi.get_transcript(video_id)
# Include complete caption information with timestamps
transcript_segments = []
for line in captions:
start_time = line.get("start", 0)
duration = line.get("duration", 0)
text = line.get("text", "")
timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]"
transcript_segments.append(f"{timestamp} {text}")
transcript_text = "\n".join(transcript_segments)
await task_logger.log_task_progress(
log_entry,
f"Transcript fetched successfully: {len(captions)} segments",
{
"stage": "transcript_fetched",
"segments_count": len(captions),
"transcript_length": len(transcript_text),
},
)
except Exception as e:
transcript_text = f"No captions available for this video. Error: {e!s}"
await task_logger.log_task_progress(
log_entry,
f"No transcript available for video: {video_id}",
{"stage": "transcript_unavailable", "error": str(e)},
)
# Format document
await task_logger.log_task_progress(
log_entry,
f"Processing video content: {video_data.get('title', 'YouTube Video')}",
{"stage": "content_processing"},
)
# Format document metadata in a more maintainable way
metadata_sections = [
(
"METADATA",
[
f"TITLE: {video_data.get('title', 'YouTube Video')}",
f"URL: {url}",
f"VIDEO_ID: {video_id}",
f"AUTHOR: {video_data.get('author_name', 'Unknown')}",
f"THUMBNAIL: {video_data.get('thumbnail_url', '')}",
],
),
(
"CONTENT",
["FORMAT: transcript", "TEXT_START", transcript_text, "TEXT_END"],
),
]
# Build the document string more efficiently
document_parts = []
document_parts.append("<DOCUMENT>")
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check for duplicates
await task_logger.log_task_progress(
log_entry,
f"Checking for duplicate video content: {video_id}",
{"stage": "duplicate_check", "content_hash": content_hash},
)
existing_document = await check_duplicate_document(session, content_hash)
if existing_document:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
"video_id": video_id,
},
)
logging.info(
f"Document with content hash {content_hash} already exists. Skipping processing."
)
return existing_document
# Get LLM for summary generation
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
{"stage": "llm_setup"},
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id)
if not user_llm:
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary
await task_logger.log_task_progress(
log_entry,
f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
{"stage": "summary_generation"},
)
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm
)
# Process chunks
await task_logger.log_task_progress(
log_entry,
f"Processing content chunks for video: {video_data.get('title', 'YouTube Video')}",
{"stage": "chunk_processing"},
)
chunks = await create_document_chunks(combined_document_string)
# Create document
await task_logger.log_task_progress(
log_entry,
f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
{"stage": "document_creation", "chunks_count": len(chunks)},
)
document = Document(
title=video_data.get("title", "YouTube Video"),
document_type=DocumentType.YOUTUBE_VIDEO,
document_metadata={
"url": url,
"video_id": video_id,
"video_title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
search_space_id=search_space_id,
content_hash=content_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {video_data.get('title', 'YouTube Video')}",
{
"document_id": document.id,
"video_id": video_id,
"title": document.title,
"content_hash": content_hash,
"chunks_count": len(chunks),
"summary_length": len(summary_content),
"has_transcript": "No captions available" not in transcript_text,
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error while processing YouTube video: {url}",
str(db_error),
{
"error_type": "SQLAlchemyError",
"video_id": video_id if "video_id" in locals() else None,
},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{
"error_type": type(e).__name__,
"video_id": video_id if "video_id" in locals() else None,
},
)
logging.error(f"Failed to process YouTube video: {e!s}")
raise
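A quick self-check of the URL formats the parser above handles; the import path is an assumption and the video ID is a placeholder.

# Exercise get_youtube_video_id against the URL shapes covered by the parser.
from app.tasks.background_tasks.document_processors.youtube_processor import (  # assumed dotted path
    get_youtube_video_id,
)

EXPECTED = {
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ": "dQw4w9WgXcQ",
    "https://youtu.be/dQw4w9WgXcQ": "dQw4w9WgXcQ",
    "https://www.youtube.com/embed/dQw4w9WgXcQ": "dQw4w9WgXcQ",
    "https://www.youtube.com/v/dQw4w9WgXcQ": "dQw4w9WgXcQ",
    "https://example.com/watch?v=abc": None,  # non-YouTube host returns None
}

for url, expected in EXPECTED.items():
    assert get_youtube_video_id(url) == expected, url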