mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-09 13:54:40 +00:00)
refactor: refactored background_tasks & indexing_tasks
parent 356bbb86f5
commit 5aa52375c3
24 changed files with 4704 additions and 5149 deletions
326 surfsense_backend/app/tasks/connector_indexers/github_indexer.py (Normal file)
@@ -0,0 +1,326 @@
"""
|
||||
GitHub connector indexer.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.github_connector import GitHubConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import generate_content_hash
|
||||
|
||||
from .base import (
|
||||
check_duplicate_document_by_hash,
|
||||
create_document_chunks,
|
||||
get_connector_by_id,
|
||||
logger,
|
||||
)
|
||||
|
||||
|
||||
async def index_github_repos(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index code and documentation files from accessible GitHub repositories.

    Args:
        session: Database session
        connector_id: ID of the GitHub connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
        start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
        end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="github_repos_indexing",
        source="connector_indexing_task",
        message=f"Starting GitHub repositories indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    documents_processed = 0
    errors = []

    try:
        # 1. Get the GitHub connector from the database
        await task_logger.log_task_progress(
            log_entry,
            f"Retrieving GitHub connector {connector_id} from database",
            {"stage": "connector_retrieval"},
        )

        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return (
                0,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
            )

        # 2. Get the GitHub PAT and selected repositories from the connector config
        github_pat = connector.config.get("GITHUB_PAT")
        repo_full_names_to_index = connector.config.get("repo_full_names")
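        # Illustrative (assumed) connector config shape, based on the two keys read above:
        #   {"GITHUB_PAT": "ghp_...", "repo_full_names": ["owner/repo-a", "owner/repo-b"]}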

        if not github_pat:
            await task_logger.log_task_failure(
                log_entry,
                f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
                "Missing GitHub PAT",
                {"error_type": "MissingToken"},
            )
            return 0, "GitHub Personal Access Token (PAT) not found in connector config"

        if not repo_full_names_to_index or not isinstance(
            repo_full_names_to_index, list
        ):
            await task_logger.log_task_failure(
                log_entry,
                f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}",
                "Invalid repo configuration",
                {"error_type": "InvalidConfiguration"},
            )
            return 0, "'repo_full_names' not found or is not a list in connector config"

        # 3. Initialize GitHub connector client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing GitHub client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "repo_count": len(repo_full_names_to_index),
            },
        )

        try:
            github_client = GitHubConnector(token=github_pat)
        except ValueError as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to initialize GitHub client for connector {connector_id}",
                str(e),
                {"error_type": "ClientInitializationError"},
            )
            return 0, f"Failed to initialize GitHub client: {e!s}"

        # 4. Validate selected repositories
        await task_logger.log_task_progress(
            log_entry,
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
            {
                "stage": "repo_processing",
                "repo_count": len(repo_full_names_to_index),
                "start_date": start_date,
                "end_date": end_date,
            },
        )

        logger.info(
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
        )
        if start_date and end_date:
            logger.info(
                f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
            )

        # 5. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue

            logger.info(f"Processing repository: {repo_full_name}")
            try:
                files_to_index = github_client.get_repository_files(repo_full_name)
                if not files_to_index:
                    logger.info(
                        f"No indexable files found in repository: {repo_full_name}"
                    )
                    continue

                logger.info(
                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
                )

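                # Each entry returned by get_repository_files is assumed to be a dict
                # with "path", "url", "sha", and "type" keys, since those are the only
                # fields read below (illustrative note, not enforced here).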
                for file_info in files_to_index:
                    file_path = file_info.get("path")
                    file_url = file_info.get("url")
                    file_sha = file_info.get("sha")
                    file_type = file_info.get("type")  # 'code' or 'doc'
                    full_path_key = f"{repo_full_name}/{file_path}"

                    if not file_path or not file_url or not file_sha:
                        logger.warning(
                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
                        )
                        continue

                    # Get file content
                    file_content = github_client.get_file_content(
                        repo_full_name, file_path
                    )

                    if file_content is None:
                        logger.warning(
                            f"Could not retrieve content for {full_path_key}. Skipping."
                        )
                        continue  # Skip if content fetch failed

                    content_hash = generate_content_hash(file_content, search_space_id)
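                    # Note: the hash is derived from both the file content and
                    # search_space_id, so the duplicate check below is presumably
                    # scoped per search space (assumption based on the call above).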

                    # Check if document with this content hash already exists
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )

                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
                        )
                        continue

                    # Use the full file content for chunking; store a short summary as
                    # the document body (may need refinement).
                    summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple summary
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                    # Chunk the content
                    try:
                        # Use code chunker if available, otherwise regular chunker
                        if hasattr(config, "code_chunker_instance"):
                            chunks_data = [
                                {
                                    "content": chunk.text,
                                    "embedding": config.embedding_model_instance.embed(
                                        chunk.text
                                    ),
                                }
                                for chunk in config.code_chunker_instance.chunk(
                                    file_content
                                )
                            ]
                        else:
                            chunks_data = await create_document_chunks(file_content)

                    except Exception as chunk_err:
                        logger.error(
                            f"Failed to chunk file {full_path_key}: {chunk_err}"
                        )
                        errors.append(
                            f"Chunking failed for {full_path_key}: {chunk_err}"
                        )
                        continue  # Skip this file if chunking fails

                    doc_metadata = {
                        "repository_full_name": repo_full_name,
                        "file_path": file_path,
                        "full_path": full_path_key,  # For easier lookup
                        "url": file_url,
                        "sha": file_sha,
                        "type": file_type,
                        "indexed_at": datetime.now(UTC).isoformat(),
                    }

                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
                    )
                    session.add(document)
                    documents_processed += 1

            except Exception as repo_err:
                logger.error(
                    f"Failed to process repository {repo_full_name}: {repo_err}"
                )
                errors.append(f"Failed processing {repo_full_name}: {repo_err}")

        # Commit all changes at the end
        await session.commit()
        logger.info(
            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed GitHub indexing for connector {connector_id}",
            {
                "documents_processed": documents_processed,
                "errors_count": len(errors),
                "repo_count": len(repo_full_names_to_index),
            },
        )

    except SQLAlchemyError as db_err:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during GitHub indexing for connector {connector_id}",
            str(db_err),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(
            f"Database error during GitHub indexing for connector {connector_id}: {db_err}"
        )
        errors.append(f"Database error: {db_err}")
        return documents_processed, "; ".join(errors) if errors else str(db_err)
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Unexpected error during GitHub indexing for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(
            f"Unexpected error during GitHub indexing for connector {connector_id}: {e}",
            exc_info=True,
        )
        errors.append(f"Unexpected error: {e}")
        return documents_processed, "; ".join(errors) if errors else str(e)

    error_message = "; ".join(errors) if errors else None
    return documents_processed, error_message
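For reference, a minimal sketch of how this indexer might be invoked, assuming an async session factory named async_session_maker exported from app.db (that name, and the example IDs, are assumptions; the actual call site in SurfSense's background task runner may differ):

import asyncio

from app.db import async_session_maker  # assumed name of the session factory
from app.tasks.connector_indexers.github_indexer import index_github_repos


async def run_github_indexing() -> None:
    # Open a session, run the GitHub indexer, and report the outcome.
    async with async_session_maker() as session:
        indexed, error = await index_github_repos(
            session=session,
            connector_id=1,        # example connector ID
            search_space_id=1,     # example search space ID
            user_id="user-uuid",   # example user ID
            start_date=None,       # dates are informational for GitHub indexing
            end_date=None,
            update_last_indexed=True,
        )
        print(f"Indexed {indexed} files; error: {error}")


if __name__ == "__main__":
    asyncio.run(run_github_indexing())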