# SurfSense/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
"""
GitHub connector indexer.
"""
from datetime import UTC, datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.github_connector import GitHubConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import generate_content_hash
from .base import (
check_duplicate_document_by_hash,
create_document_chunks,
get_connector_by_id,
logger,
)


async def index_github_repos(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index code and documentation files from accessible GitHub repositories.

    Args:
        session: Database session
        connector_id: ID of the GitHub connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
        start_date: Start date (YYYY-MM-DD). Note: GitHub indexing processes
            all files regardless of dates.
        end_date: End date (YYYY-MM-DD). Note: GitHub indexing processes
            all files regardless of dates.
        update_last_indexed: Whether to update the last_indexed_at timestamp
            (default: True)

    Returns:
        Tuple of (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="github_repos_indexing",
        source="connector_indexing_task",
        message=f"Starting GitHub repositories indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    documents_processed = 0
    errors = []

    try:
        # 1. Get the GitHub connector from the database
        await task_logger.log_task_progress(
            log_entry,
            f"Retrieving GitHub connector {connector_id} from database",
            {"stage": "connector_retrieval"},
        )

        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR
        )
        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return (
                0,
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
            )

        # 2. Get the GitHub PAT and selected repositories from the connector config
        github_pat = connector.config.get("GITHUB_PAT")
        repo_full_names_to_index = connector.config.get("repo_full_names")
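        # Expected connector.config shape (values here are illustrative):
        #   {"GITHUB_PAT": "ghp_...", "repo_full_names": ["owner/repo", ...]}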

        if not github_pat:
            await task_logger.log_task_failure(
                log_entry,
                f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
                "Missing GitHub PAT",
                {"error_type": "MissingToken"},
            )
            return 0, "GitHub Personal Access Token (PAT) not found in connector config"

        if not repo_full_names_to_index or not isinstance(
            repo_full_names_to_index, list
        ):
            await task_logger.log_task_failure(
                log_entry,
                f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}",
                "Invalid repo configuration",
                {"error_type": "InvalidConfiguration"},
            )
            return 0, "'repo_full_names' not found or is not a list in connector config"

        # 3. Initialize GitHub connector client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing GitHub client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "repo_count": len(repo_full_names_to_index),
            },
        )

        try:
            github_client = GitHubConnector(token=github_pat)
        except ValueError as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to initialize GitHub client for connector {connector_id}",
                str(e),
                {"error_type": "ClientInitializationError"},
            )
            return 0, f"Failed to initialize GitHub client: {e!s}"

        # 4. Log the start of repository processing
        await task_logger.log_task_progress(
            log_entry,
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
            {
                "stage": "repo_processing",
                "repo_count": len(repo_full_names_to_index),
                "start_date": start_date,
                "end_date": end_date,
            },
        )

        logger.info(
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
        )
        if start_date and end_date:
            logger.info(
                f"Date range requested: {start_date} to {end_date} "
                "(Note: GitHub indexing processes all files regardless of dates)"
            )

        # 5. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue

            logger.info(f"Processing repository: {repo_full_name}")
            try:
                files_to_index = github_client.get_repository_files(repo_full_name)
                if not files_to_index:
                    logger.info(
                        f"No indexable files found in repository: {repo_full_name}"
                    )
                    continue

                logger.info(
                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
                )

                for file_info in files_to_index:
                    file_path = file_info.get("path")
                    file_url = file_info.get("url")
                    file_sha = file_info.get("sha")
                    file_type = file_info.get("type")  # 'code' or 'doc'

                    if not file_path or not file_url or not file_sha:
                        logger.warning(
                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
                        )
                        continue

                    full_path_key = f"{repo_full_name}/{file_path}"

                    # Get file content
                    file_content = github_client.get_file_content(
                        repo_full_name, file_path
                    )
                    if file_content is None:
                        logger.warning(
                            f"Could not retrieve content for {full_path_key}. Skipping."
                        )
                        continue  # Skip if content fetch failed

                    content_hash = generate_content_hash(file_content, search_space_id)
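                    # generate_content_hash mixes in search_space_id, so an identical
                    # file can still be indexed into a different search space.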

                    # Check if a document with this content hash already exists
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )
                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
                        )
                        continue

                    # Store a short preview as the document's main content and embed
                    # it; the full file content is chunked separately below.
                    summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."  # Simple summary
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                    # Chunk the full content: prefer the code-aware chunker when one
                    # is configured, otherwise fall back to the regular chunker.
                    try:
                        if hasattr(config, "code_chunker_instance"):
                            chunks_data = [
                                {
                                    "content": chunk.text,
                                    "embedding": config.embedding_model_instance.embed(
                                        chunk.text
                                    ),
                                }
                                for chunk in config.code_chunker_instance.chunk(
                                    file_content
                                )
                            ]
                        else:
                            chunks_data = await create_document_chunks(file_content)
                    except Exception as chunk_err:
                        logger.error(
                            f"Failed to chunk file {full_path_key}: {chunk_err}"
                        )
                        errors.append(
                            f"Chunking failed for {full_path_key}: {chunk_err}"
                        )
                        continue  # Skip this file if chunking fails
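                    # Caveat: the code-chunker branch yields plain dicts, while
                    # create_document_chunks presumably returns chunk ORM objects;
                    # both are handed to Document(chunks=...) below unchanged.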

                    doc_metadata = {
                        "repository_full_name": repo_full_name,
                        "file_path": file_path,
                        "full_path": full_path_key,  # For easier lookup
                        "url": file_url,
                        "sha": file_sha,
                        "type": file_type,
                        "indexed_at": datetime.now(UTC).isoformat(),
                    }

                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
                    )
                    session.add(document)
                    documents_processed += 1

            except Exception as repo_err:
                logger.error(
                    f"Failed to process repository {repo_full_name}: {repo_err}"
                )
                errors.append(f"Failed processing {repo_full_name}: {repo_err}")

        # Commit all changes at the end
        await session.commit()
        logger.info(
            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed GitHub indexing for connector {connector_id}",
            {
                "documents_processed": documents_processed,
                "errors_count": len(errors),
                "repo_count": len(repo_full_names_to_index),
            },
        )

    except SQLAlchemyError as db_err:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during GitHub indexing for connector {connector_id}",
            str(db_err),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(
            f"Database error during GitHub indexing for connector {connector_id}: {db_err}"
        )
        errors.append(f"Database error: {db_err}")
        return documents_processed, "; ".join(errors) if errors else str(db_err)
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Unexpected error during GitHub indexing for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(
            f"Unexpected error during GitHub indexing for connector {connector_id}: {e}",
            exc_info=True,
        )
        errors.append(f"Unexpected error: {e}")
        return documents_processed, "; ".join(errors) if errors else str(e)

    error_message = "; ".join(errors) if errors else None
    return documents_processed, error_message
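

# Example usage (illustrative sketch; `async_session_maker` and the IDs below
# are assumptions, not part of this module):
#
#     async with async_session_maker() as session:
#         processed, error = await index_github_repos(
#             session,
#             connector_id=1,
#             search_space_id=1,
#             user_id="user-uuid",
#         )
#         if error:
#             logger.warning(f"GitHub indexing finished with errors: {error}")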