""" GitHub connector indexer. """ from datetime import UTC, datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.github_connector import GitHubConnector from app.db import Document, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import generate_content_hash from .base import ( check_duplicate_document_by_hash, create_document_chunks, get_connector_by_id, logger, ) async def index_github_repos( session: AsyncSession, connector_id: int, search_space_id: int, user_id: str, start_date: str | None = None, end_date: str | None = None, update_last_indexed: bool = True, ) -> tuple[int, str | None]: """ Index code and documentation files from accessible GitHub repositories. Args: session: Database session connector_id: ID of the GitHub connector search_space_id: ID of the search space to store documents in user_id: ID of the user start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) # Log task start log_entry = await task_logger.log_task_start( task_name="github_repos_indexing", source="connector_indexing_task", message=f"Starting GitHub repositories indexing for connector {connector_id}", metadata={ "connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date, }, ) documents_processed = 0 errors = [] try: # 1. Get the GitHub connector from the database await task_logger.log_task_progress( log_entry, f"Retrieving GitHub connector {connector_id} from database", {"stage": "connector_retrieval"}, ) connector = await get_connector_by_id( session, connector_id, SearchSourceConnectorType.GITHUB_CONNECTOR ) if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a GitHub connector", "Connector not found", {"error_type": "ConnectorNotFound"}, ) return ( 0, f"Connector with ID {connector_id} not found or is not a GitHub connector", ) # 2. Get the GitHub PAT and selected repositories from the connector config github_pat = connector.config.get("GITHUB_PAT") repo_full_names_to_index = connector.config.get("repo_full_names") if not github_pat: await task_logger.log_task_failure( log_entry, f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", "Missing GitHub PAT", {"error_type": "MissingToken"}, ) return 0, "GitHub Personal Access Token (PAT) not found in connector config" if not repo_full_names_to_index or not isinstance( repo_full_names_to_index, list ): await task_logger.log_task_failure( log_entry, f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}", "Invalid repo configuration", {"error_type": "InvalidConfiguration"}, ) return 0, "'repo_full_names' not found or is not a list in connector config" # 3. 

        # 3. Initialize GitHub connector client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing GitHub client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "repo_count": len(repo_full_names_to_index),
            },
        )

        try:
            github_client = GitHubConnector(token=github_pat)
        except ValueError as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to initialize GitHub client for connector {connector_id}",
                str(e),
                {"error_type": "ClientInitializationError"},
            )
            return 0, f"Failed to initialize GitHub client: {e!s}"

        # 4. Log the start of processing for the selected repositories
        await task_logger.log_task_progress(
            log_entry,
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
            {
                "stage": "repo_processing",
                "repo_count": len(repo_full_names_to_index),
                "start_date": start_date,
                "end_date": end_date,
            },
        )

        logger.info(
            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
        )
        if start_date and end_date:
            logger.info(
                f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
            )

        # 5. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue

            logger.info(f"Processing repository: {repo_full_name}")

            try:
                files_to_index = github_client.get_repository_files(repo_full_name)

                if not files_to_index:
                    logger.info(
                        f"No indexable files found in repository: {repo_full_name}"
                    )
                    continue

                logger.info(
                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
                )
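
                # Each file_info is expected to look roughly like the dict below
                # (inferred from the keys read in the loop that follows; the exact
                # shape is defined by GitHubConnector.get_repository_files):
                #     {"path": "src/app.py", "url": "...", "sha": "...", "type": "code" or "doc"}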
                for file_info in files_to_index:
                    file_path = file_info.get("path")
                    file_url = file_info.get("url")
                    file_sha = file_info.get("sha")
                    file_type = file_info.get("type")  # 'code' or 'doc'
                    full_path_key = f"{repo_full_name}/{file_path}"

                    if not file_path or not file_url or not file_sha:
                        logger.warning(
                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
                        )
                        continue

                    # Get file content
                    file_content = github_client.get_file_content(
                        repo_full_name, file_path
                    )
                    if file_content is None:
                        logger.warning(
                            f"Could not retrieve content for {full_path_key}. Skipping."
                        )
                        continue  # Skip if content fetch failed

                    content_hash = generate_content_hash(file_content, search_space_id)

                    # Check if a document with this content hash already exists
                    existing_document_by_hash = await check_duplicate_document_by_hash(
                        session, content_hash
                    )

                    if existing_document_by_hash:
                        logger.info(
                            f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
                        )
                        continue

                    # Store a truncated summary as the document body; the full
                    # file content is chunked and embedded below
                    summary_content = (
                        f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
                    )
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                    # Chunk the content: use the code chunker if available,
                    # otherwise fall back to the regular document chunker
                    try:
                        if hasattr(config, "code_chunker_instance"):
                            chunks_data = [
                                {
                                    "content": chunk.text,
                                    "embedding": config.embedding_model_instance.embed(
                                        chunk.text
                                    ),
                                }
                                for chunk in config.code_chunker_instance.chunk(
                                    file_content
                                )
                            ]
                        else:
                            chunks_data = await create_document_chunks(file_content)
                    except Exception as chunk_err:
                        logger.error(
                            f"Failed to chunk file {full_path_key}: {chunk_err}"
                        )
                        errors.append(
                            f"Chunking failed for {full_path_key}: {chunk_err}"
                        )
                        continue  # Skip this file if chunking fails

                    doc_metadata = {
                        "repository_full_name": repo_full_name,
                        "file_path": file_path,
                        "full_path": full_path_key,  # For easier lookup
                        "url": file_url,
                        "sha": file_sha,
                        "type": file_type,
                        "indexed_at": datetime.now(UTC).isoformat(),
                    }

                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
                    )
                    session.add(document)
                    documents_processed += 1

            except Exception as repo_err:
                logger.error(
                    f"Failed to process repository {repo_full_name}: {repo_err}"
                )
                errors.append(f"Failed processing {repo_full_name}: {repo_err}")

        # Commit all changes at the end
        await session.commit()
        logger.info(
            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed GitHub indexing for connector {connector_id}",
            {
                "documents_processed": documents_processed,
                "errors_count": len(errors),
                "repo_count": len(repo_full_names_to_index),
            },
        )

    except SQLAlchemyError as db_err:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during GitHub indexing for connector {connector_id}",
            str(db_err),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(
            f"Database error during GitHub indexing for connector {connector_id}: {db_err}"
        )
        errors.append(f"Database error: {db_err}")
        return documents_processed, "; ".join(errors) if errors else str(db_err)
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Unexpected error during GitHub indexing for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(
            f"Unexpected error during GitHub indexing for connector {connector_id}: {e}",
            exc_info=True,
        )
        errors.append(f"Unexpected error: {e}")
        return documents_processed, "; ".join(errors) if errors else str(e)

    error_message = "; ".join(errors) if errors else None
    return documents_processed, error_message
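

# Illustrative usage sketch (not part of this module's runtime behavior).
# The real call site lives in the connector indexing task; the session, connector
# ID, and search space ID below are hypothetical placeholders:
#
#     docs_indexed, error = await index_github_repos(
#         session=session,       # AsyncSession from the app's session factory
#         connector_id=42,       # hypothetical GitHub connector ID
#         search_space_id=7,     # hypothetical search space ID
#         user_id="user-uuid",
#     )
#     if error:
#         logger.error(f"GitHub indexing completed with errors: {error}")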