From bb198e38c015c6f7e0face338158627bc1b10f62 Mon Sep 17 00:00:00 2001 From: Adamsmith6300 Date: Sun, 13 Apr 2025 13:56:22 -0700 Subject: [PATCH 1/4] add github connector, add alembic for db migrations, fix bug updating connectors --- surfsense_backend/.gitignore | 3 +- surfsense_backend/alembic.ini | 119 +++++++ surfsense_backend/alembic/README | 1 + surfsense_backend/alembic/env.py | 98 ++++++ surfsense_backend/alembic/script.py.mako | 28 ++ .../versions/1_add_github_connector_enum.py | 53 ++++ .../app/connectors/github_connector.py | 182 +++++++++++ surfsense_backend/app/db.py | 2 + .../routes/search_source_connectors_routes.py | 146 +++++---- .../app/schemas/search_source_connector.py | 12 +- .../app/tasks/connectors_indexing_tasks.py | 195 +++++++++++- surfsense_backend/main.py | 7 + surfsense_backend/pyproject.toml | 2 + surfsense_backend/uv.lock | 54 ++++ .../connectors/(manage)/page.tsx | 3 +- .../connectors/[connector_id]/page.tsx | 24 +- .../connectors/add/github-connector/page.tsx | 298 ++++++++++++++++++ .../[search_space_id]/connectors/add/page.tsx | 189 +++++------ 18 files changed, 1232 insertions(+), 184 deletions(-) create mode 100644 surfsense_backend/alembic.ini create mode 100644 surfsense_backend/alembic/README create mode 100644 surfsense_backend/alembic/env.py create mode 100644 surfsense_backend/alembic/script.py.mako create mode 100644 surfsense_backend/alembic/versions/1_add_github_connector_enum.py create mode 100644 surfsense_backend/app/connectors/github_connector.py create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/github-connector/page.tsx diff --git a/surfsense_backend/.gitignore b/surfsense_backend/.gitignore index a663d9d..ee59e47 100644 --- a/surfsense_backend/.gitignore +++ b/surfsense_backend/.gitignore @@ -3,4 +3,5 @@ venv/ data/ __pycache__/ -.flashrank_cache \ No newline at end of file +.flashrank_cache +surf_new_backend.egg-info/ diff --git a/surfsense_backend/alembic.ini b/surfsense_backend/alembic.ini new file mode 100644 index 0000000..9b2a76f --- /dev/null +++ b/surfsense_backend/alembic.ini @@ -0,0 +1,119 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. 
When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. +version_path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# The SQLAlchemy URL to connect to +# IMPORTANT: Replace this with your actual async database URL +sqlalchemy.url = postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/surfsense_backend/alembic/README b/surfsense_backend/alembic/README new file mode 100644 index 0000000..e0d0858 --- /dev/null +++ b/surfsense_backend/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration with an async dbapi. 
\ No newline at end of file diff --git a/surfsense_backend/alembic/env.py b/surfsense_backend/alembic/env.py new file mode 100644 index 0000000..d6e7104 --- /dev/null +++ b/surfsense_backend/alembic/env.py @@ -0,0 +1,98 @@ +import asyncio +from logging.config import fileConfig + +import os +import sys +from sqlalchemy import pool +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import async_engine_from_config + +from alembic import context + +# Ensure the app directory is in the Python path +# This allows Alembic to find your models +sys.path.insert(0, os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))) + +# Import your models base +from app.db import Base # Assuming your Base is defined in app.db + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection: Connection) -> None: + context.configure(connection=connection, target_metadata=target_metadata) + + with context.begin_transaction(): + context.run_migrations() + + +async def run_async_migrations() -> None: + """In this scenario we need to create an Engine + and associate a connection with the context. + + """ + + connectable = async_engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + + await connectable.dispose() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode.""" + + asyncio.run(run_async_migrations()) + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/surfsense_backend/alembic/script.py.mako b/surfsense_backend/alembic/script.py.mako new file mode 100644 index 0000000..480b130 --- /dev/null +++ b/surfsense_backend/alembic/script.py.mako @@ -0,0 +1,28 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. 
+revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + """Upgrade schema.""" + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + """Downgrade schema.""" + ${downgrades if downgrades else "pass"} diff --git a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py new file mode 100644 index 0000000..bb72838 --- /dev/null +++ b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py @@ -0,0 +1,53 @@ +"""Add GITHUB_CONNECTOR to SearchSourceConnectorType enum + +Revision ID: 1 +Revises: +Create Date: 2023-10-27 10:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +# Import pgvector if needed for other types, though not for this ENUM change +# import pgvector + + +# revision identifiers, used by Alembic. +revision: str = '1' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Manually add the command to add the enum value + # Note: It's generally better to let autogenerate handle this, but we're bypassing it + op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'") + + # Pass for the rest, as autogenerate didn't run to add other schema details + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Downgrading removal of an enum value is complex and potentially dangerous + # if the value is in use. Often omitted or requires manual SQL based on context. + # For now, we'll just pass. If you needed to reverse this, you'd likely + # have to manually check if 'GITHUB_CONNECTOR' is used in the table + # and then potentially recreate the type without it. 
+ op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") + op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')") + op.execute(( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + )) + op.execute("DROP TYPE searchsourceconnectortype_old") + + + pass + # ### end Alembic commands ### diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py new file mode 100644 index 0000000..d827dac --- /dev/null +++ b/surfsense_backend/app/connectors/github_connector.py @@ -0,0 +1,182 @@ +import base64 +import logging +from typing import List, Optional, Dict, Any, Tuple +from github3 import login as github_login, exceptions as github_exceptions +from github3.repos.repo import Repository +from github3.repos.contents import Contents +from github3.exceptions import ForbiddenError, NotFoundError + +logger = logging.getLogger(__name__) + +# List of common code file extensions to target +CODE_EXTENSIONS = { + '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.c', '.cpp', '.h', '.hpp', + '.cs', '.go', '.rb', '.php', '.swift', '.kt', '.scala', '.rs', '.m', + '.sh', '.bash', '.ps1', '.lua', '.pl', '.pm', '.r', '.dart', '.sql' +} + +# List of common documentation/text file extensions +DOC_EXTENSIONS = { + '.md', '.txt', '.rst', '.adoc', '.html', '.htm', '.xml', '.json', '.yaml', '.yml', '.toml' +} + +# Maximum file size in bytes (e.g., 1MB) +MAX_FILE_SIZE = 1 * 1024 * 1024 + +class GitHubConnector: + """Connector for interacting with the GitHub API.""" + + def __init__(self, token: str): + """ + Initializes the GitHub connector. + + Args: + token: GitHub Personal Access Token (PAT). + """ + if not token: + raise ValueError("GitHub token cannot be empty.") + try: + self.gh = github_login(token=token) + # Try a simple authenticated call to check token validity + self.gh.me() + logger.info("Successfully authenticated with GitHub API.") + except (github_exceptions.AuthenticationFailed, ForbiddenError) as e: + logger.error(f"GitHub authentication failed: {e}") + raise ValueError("Invalid GitHub token or insufficient permissions.") + except Exception as e: + logger.error(f"Failed to initialize GitHub client: {e}") + raise + + def get_user_repositories(self) -> List[Dict[str, Any]]: + """Fetches repositories accessible by the authenticated user.""" + repos_data = [] + try: + # type='owner' fetches repos owned by the user + # type='member' fetches repos the user is a collaborator on (including orgs) + # type='all' fetches both + for repo in self.gh.repositories(type='all', sort='updated'): + if isinstance(repo, Repository): + repos_data.append({ + "id": repo.id, + "name": repo.name, + "full_name": repo.full_name, + "private": repo.private, + "url": repo.html_url, + "description": repo.description or "", + "last_updated": repo.updated_at.isoformat() if repo.updated_at else None, + }) + logger.info(f"Fetched {len(repos_data)} repositories.") + return repos_data + except Exception as e: + logger.error(f"Failed to fetch GitHub repositories: {e}") + return [] # Return empty list on error + + def get_repository_files(self, repo_full_name: str, path: str = '') -> List[Dict[str, Any]]: + """ + Recursively fetches details of relevant files (code, docs) within a repository path. + + Args: + repo_full_name: The full name of the repository (e.g., 'owner/repo'). 
+ path: The starting path within the repository (default is root). + + Returns: + A list of dictionaries, each containing file details (path, sha, url, size). + Returns an empty list if the repository or path is not found or on error. + """ + files_list = [] + try: + owner, repo_name = repo_full_name.split('/') + repo = self.gh.repository(owner, repo_name) + if not repo: + logger.warning(f"Repository '{repo_full_name}' not found.") + return [] + + contents = repo.directory_contents(path=path) # Use directory_contents for clarity + + # contents returns a list of tuples (name, content_obj) + for item_name, content_item in contents: + if not isinstance(content_item, Contents): + continue + + if content_item.type == 'dir': + # Recursively fetch contents of subdirectory + files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path)) + elif content_item.type == 'file': + # Check if the file extension is relevant and size is within limits + file_extension = '.' + content_item.name.split('.')[-1].lower() if '.' in content_item.name else '' + is_code = file_extension in CODE_EXTENSIONS + is_doc = file_extension in DOC_EXTENSIONS + + if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE: + files_list.append({ + "path": content_item.path, + "sha": content_item.sha, + "url": content_item.html_url, + "size": content_item.size, + "type": "code" if is_code else "doc" + }) + elif content_item.size > MAX_FILE_SIZE: + logger.debug(f"Skipping large file: {content_item.path} ({content_item.size} bytes)") + else: + logger.debug(f"Skipping irrelevant file type: {content_item.path}") + + except (NotFoundError, ForbiddenError) as e: + logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}") + except Exception as e: + logger.error(f"Failed to get files for {repo_full_name} at path '{path}': {e}") + # Return what we have collected so far in case of partial failure + + return files_list + + def get_file_content(self, repo_full_name: str, file_path: str) -> Optional[str]: + """ + Fetches the decoded content of a specific file. + + Args: + repo_full_name: The full name of the repository (e.g., 'owner/repo'). + file_path: The path to the file within the repository. + + Returns: + The decoded file content as a string, or None if fetching fails or file is too large. + """ + try: + owner, repo_name = repo_full_name.split('/') + repo = self.gh.repository(owner, repo_name) + if not repo: + logger.warning(f"Repository '{repo_full_name}' not found when fetching file '{file_path}'.") + return None + + content_item = repo.file_contents(path=file_path) # Use file_contents for clarity + + if not content_item or not isinstance(content_item, Contents) or content_item.type != 'file': + logger.warning(f"File '{file_path}' not found or is not a file in '{repo_full_name}'.") + return None + + if content_item.size > MAX_FILE_SIZE: + logger.warning(f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch.") + return None + + # Content is base64 encoded + if content_item.content: + try: + decoded_content = base64.b64decode(content_item.content).decode('utf-8') + return decoded_content + except UnicodeDecodeError: + logger.warning(f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. 
Trying with 'latin-1'.") + try: + # Try a fallback encoding + decoded_content = base64.b64decode(content_item.content).decode('latin-1') + return decoded_content + except Exception as decode_err: + logger.error(f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}") + return None # Give up if fallback fails + else: + logger.warning(f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty.") + return "" # Return empty string for empty files + + except (NotFoundError, ForbiddenError) as e: + logger.warning(f"Cannot access file '{file_path}' in '{repo_full_name}': {e}") + return None + except Exception as e: + logger.error(f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}") + return None diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index b0fb2f0..25b7bfb 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -40,12 +40,14 @@ class DocumentType(str, Enum): SLACK_CONNECTOR = "SLACK_CONNECTOR" NOTION_CONNECTOR = "NOTION_CONNECTOR" YOUTUBE_VIDEO = "YOUTUBE_VIDEO" + GITHUB_CONNECTOR = "GITHUB_CONNECTOR" class SearchSourceConnectorType(str, Enum): SERPER_API = "SERPER_API" TAVILY_API = "TAVILY_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" NOTION_CONNECTOR = "NOTION_CONNECTOR" + GITHUB_CONNECTOR = "GITHUB_CONNECTOR" class ChatType(str, Enum): GENERAL = "GENERAL" diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 4025f2d..482a825 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -14,13 +14,13 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.exc import IntegrityError from typing import List, Dict, Any -from app.db import get_async_session, User, SearchSourceConnector, SearchSourceConnectorType, SearchSpace +from app.db import get_async_session, User, SearchSourceConnector, SearchSourceConnectorType, SearchSpace, async_session_maker from app.schemas import SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead from app.users import current_active_user from app.utils.check_ownership import check_ownership from pydantic import ValidationError -from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages -from datetime import datetime +from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos +from datetime import datetime, timezone import logging # Set up logging @@ -50,13 +50,11 @@ async def create_search_source_connector( ) ) existing_connector = result.scalars().first() - if existing_connector: raise HTTPException( status_code=409, detail=f"A connector with type {connector.connector_type} already exists. Each user can have only one connector of each type." 
) - db_connector = SearchSourceConnector(**connector.model_dump(), user_id=user.id) session.add(db_connector) await session.commit() @@ -239,10 +237,15 @@ async def index_connector_content( search_space = await check_ownership(session, SearchSpace, search_space_id, user) # Handle different connector types + response_message = "" + indexing_from = None + indexing_to = None + today_str = datetime.now().strftime("%Y-%m-%d") + if connector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # Determine the time range that will be indexed if not connector.last_indexed_at: - start_date = "365 days ago" + start_date = "365 days ago" # Or perhaps set a specific date if needed else: # Check if last_indexed_at is today today = datetime.now().date() @@ -252,33 +255,18 @@ async def index_connector_content( else: start_date = connector.last_indexed_at.strftime("%Y-%m-%d") - # Add the indexing task to background tasks - if background_tasks: - background_tasks.add_task( - run_slack_indexing_with_new_session, - connector_id, - search_space_id - ) - - return { - "success": True, - "message": "Slack indexing started in the background", - "connector_type": connector.connector_type, - "search_space": search_space.name, - "indexing_from": start_date, - "indexing_to": datetime.now().strftime("%Y-%m-%d") - } - else: - # For testing or if background tasks are not available - return { - "success": False, - "message": "Background tasks not available", - "connector_type": connector.connector_type - } + indexing_from = start_date + indexing_to = today_str + + # Run indexing in background + logger.info(f"Triggering Slack indexing for connector {connector_id} into search space {search_space_id}") + background_tasks.add_task(run_slack_indexing_with_new_session, connector_id, search_space_id) + response_message = "Slack indexing started in the background." + elif connector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR: # Determine the time range that will be indexed if not connector.last_indexed_at: - start_date = "365 days ago" + start_date = "365 days ago" # Or perhaps set a specific date else: # Check if last_indexed_at is today today = datetime.now().date() @@ -288,44 +276,46 @@ async def index_connector_content( else: start_date = connector.last_indexed_at.strftime("%Y-%m-%d") - # Add the indexing task to background tasks - if background_tasks: - background_tasks.add_task( - run_notion_indexing_with_new_session, - connector_id, - search_space_id - ) - - return { - "success": True, - "message": "Notion indexing started in the background", - "connector_type": connector.connector_type, - "search_space": search_space.name, - "indexing_from": start_date, - "indexing_to": datetime.now().strftime("%Y-%m-%d") - } - else: - # For testing or if background tasks are not available - return { - "success": False, - "message": "Background tasks not available", - "connector_type": connector.connector_type - } + indexing_from = start_date + indexing_to = today_str + + # Run indexing in background + logger.info(f"Triggering Notion indexing for connector {connector_id} into search space {search_space_id}") + background_tasks.add_task(run_notion_indexing_with_new_session, connector_id, search_space_id) + response_message = "Notion indexing started in the background." 
+ + elif connector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR: + # GitHub connector likely indexes everything relevant, or uses internal logic + # Setting indexing_from to None and indexing_to to today + indexing_from = None + indexing_to = today_str + + # Run indexing in background + logger.info(f"Triggering GitHub indexing for connector {connector_id} into search space {search_space_id}") + background_tasks.add_task(run_github_indexing_with_new_session, connector_id, search_space_id) + response_message = "GitHub indexing started in the background." + else: raise HTTPException( status_code=400, detail=f"Indexing not supported for connector type: {connector.connector_type}" ) - + + return { + "message": response_message, + "connector_id": connector_id, + "search_space_id": search_space_id, + "indexing_from": indexing_from, + "indexing_to": indexing_to + } except HTTPException: raise except Exception as e: - logger.error(f"Failed to start indexing: {str(e)}") + logger.error(f"Failed to initiate indexing for connector {connector_id}: {e}", exc_info=True) raise HTTPException( status_code=500, - detail=f"Failed to start indexing: {str(e)}" - ) - + detail=f"Failed to initiate indexing: {str(e)}" + ) async def update_connector_last_indexed( session: AsyncSession, @@ -361,8 +351,6 @@ async def run_slack_indexing_with_new_session( Create a new session and run the Slack indexing task. This prevents session leaks by creating a dedicated session for the background task. """ - from app.db import async_session_maker - async with async_session_maker() as session: await run_slack_indexing(session, connector_id, search_space_id) @@ -405,8 +393,6 @@ async def run_notion_indexing_with_new_session( Create a new session and run the Notion indexing task. This prevents session leaks by creating a dedicated session for the background task. """ - from app.db import async_session_maker - async with async_session_maker() as session: await run_notion_indexing(session, connector_id, search_space_id) @@ -439,4 +425,38 @@ async def run_notion_indexing( else: logger.error(f"Notion indexing failed or no documents processed: {error_or_warning}") except Exception as e: - logger.error(f"Error in background Notion indexing task: {str(e)}") \ No newline at end of file + logger.error(f"Error in background Notion indexing task: {str(e)}") + +# Add new helper functions for GitHub indexing +async def run_github_indexing_with_new_session( + connector_id: int, + search_space_id: int +): + """Wrapper to run GitHub indexing with its own database session.""" + logger.info(f"Background task started: Indexing GitHub connector {connector_id} into space {search_space_id}") + async with async_session_maker() as session: + await run_github_indexing(session, connector_id, search_space_id) + logger.info(f"Background task finished: Indexing GitHub connector {connector_id}") + +async def run_github_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int +): + """Runs the GitHub indexing task and updates the timestamp.""" + try: + indexed_count, error_message = await index_github_repos( + session, connector_id, search_space_id, update_last_indexed=False + ) + if error_message: + logger.error(f"GitHub indexing failed for connector {connector_id}: {error_message}") + # Optionally update status in DB to indicate failure + else: + logger.info(f"GitHub indexing successful for connector {connector_id}. 
Indexed {indexed_count} documents.") + # Update the last indexed timestamp only on success + await update_connector_last_indexed(session, connector_id) + await session.commit() # Commit timestamp update + except Exception as e: + await session.rollback() + logger.error(f"Critical error in run_github_indexing for connector {connector_id}: {e}", exc_info=True) + # Optionally update status in DB to indicate failure diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index f86f45d..5386658 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -57,6 +57,16 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the integration token is not empty if not config.get("NOTION_INTEGRATION_TOKEN"): raise ValueError("NOTION_INTEGRATION_TOKEN cannot be empty") + + elif connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR: + # For GITHUB_CONNECTOR, only allow GITHUB_TOKEN + allowed_keys = ["GITHUB_PAT"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the token is not empty + if not config.get("GITHUB_PAT"): + raise ValueError("GITHUB_TOKEN cannot be empty") return config @@ -70,4 +80,4 @@ class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, TimestampMod user_id: uuid.UUID class Config: - from_attributes = True \ No newline at end of file + from_attributes = True diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index 580a5c7..670fa26 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -3,12 +3,13 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.future import select from sqlalchemy import delete -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from app.db import Document, DocumentType, Chunk, SearchSourceConnector, SearchSourceConnectorType from app.config import config from app.prompts import SUMMARY_PROMPT_TEMPLATE from app.connectors.slack_history import SlackHistory from app.connectors.notion_history import NotionHistoryConnector +from app.connectors.github_connector import GitHubConnector from slack_sdk.errors import SlackApiError import logging @@ -589,3 +590,195 @@ async def index_notion_pages( await session.rollback() logger.error(f"Failed to index Notion pages: {str(e)}", exc_info=True) return 0, f"Failed to index Notion pages: {str(e)}" + +async def index_github_repos( + session: AsyncSession, + connector_id: int, + search_space_id: int, + update_last_indexed: bool = True +) -> Tuple[int, Optional[str]]: + """ + Index code and documentation files from accessible GitHub repositories. + + Args: + session: Database session + connector_id: ID of the GitHub connector + search_space_id: ID of the search space to store documents in + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + documents_processed = 0 + errors = [] + + try: + # 1. 
Get the GitHub connector from the database + result = await session.execute( + select(SearchSourceConnector) + .filter( + SearchSourceConnector.id == connector_id, + SearchSourceConnector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR + ) + ) + connector = result.scalars().first() + + if not connector: + return 0, f"Connector with ID {connector_id} not found or is not a GitHub connector" + + # 2. Get the GitHub PAT from the connector config + github_pat = connector.config.get("GITHUB_PAT") + if not github_pat: + return 0, "GitHub Personal Access Token (PAT) not found in connector config" + + # 3. Initialize GitHub connector client + try: + github_client = GitHubConnector(token=github_pat) + except ValueError as e: + return 0, f"Failed to initialize GitHub client: {str(e)}" + + # 4. Get list of accessible repositories + repositories = github_client.get_user_repositories() + if not repositories: + logger.info("No accessible GitHub repositories found for the provided token.") + return 0, "No accessible GitHub repositories found." + + logger.info(f"Found {len(repositories)} repositories to potentially index.") + + # 5. Get existing documents for this search space and connector type to prevent duplicates + existing_docs_result = await session.execute( + select(Document) + .filter( + Document.search_space_id == search_space_id, + Document.document_type == DocumentType.GITHUB_CONNECTOR + ) + ) + existing_docs = existing_docs_result.scalars().all() + # Create a lookup dict: key=repo_fullname/file_path, value=Document object + existing_docs_lookup = {doc.document_metadata.get("full_path"): doc for doc in existing_docs if doc.document_metadata.get("full_path")} + logger.info(f"Found {len(existing_docs_lookup)} existing GitHub documents in database for search space {search_space_id}") + + # 6. Iterate through repositories and index files + for repo_info in repositories: + repo_full_name = repo_info.get("full_name") + if not repo_full_name: + logger.warning(f"Skipping repository with missing full_name: {repo_info.get('name')}") + continue + + logger.info(f"Processing repository: {repo_full_name}") + try: + files_to_index = github_client.get_repository_files(repo_full_name) + if not files_to_index: + logger.info(f"No indexable files found in repository: {repo_full_name}") + continue + + logger.info(f"Found {len(files_to_index)} files to process in {repo_full_name}") + + for file_info in files_to_index: + file_path = file_info.get("path") + file_url = file_info.get("url") + file_sha = file_info.get("sha") + file_type = file_info.get("type") # 'code' or 'doc' + full_path_key = f"{repo_full_name}/{file_path}" + + if not file_path or not file_url or not file_sha: + logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}") + continue + + # Check if document already exists and if content hash matches + existing_doc = existing_docs_lookup.get(full_path_key) + if existing_doc and existing_doc.document_metadata.get("sha") == file_sha: + logger.debug(f"Skipping unchanged file: {full_path_key}") + continue # Skip if SHA matches (content hasn't changed) + + # Get file content + file_content = github_client.get_file_content(repo_full_name, file_path) + + if file_content is None: + logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.") + continue # Skip if content fetch failed + + # Use file_content directly for chunking, maybe summary for main content? 
+ # For now, let's use the full content for both, might need refinement + summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." # Simple summary + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Chunk the content + try: + chunks_data = [ + Chunk(content=chunk.text, embedding=chunk.embedding) + for chunk in config.chunker_instance.chunk(file_content) + ] + except Exception as chunk_err: + logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") + errors.append(f"Chunking failed for {full_path_key}: {chunk_err}") + continue # Skip this file if chunking fails + + doc_metadata = { + "repository_full_name": repo_full_name, + "file_path": file_path, + "full_path": full_path_key, # For easier lookup + "url": file_url, + "sha": file_sha, + "type": file_type, + "indexed_at": datetime.now(timezone.utc).isoformat() + } + + if existing_doc: + # Update existing document + logger.info(f"Updating document for file: {full_path_key}") + existing_doc.title = f"GitHub - {file_path}" + existing_doc.document_metadata = doc_metadata + existing_doc.content = summary_content # Update summary + existing_doc.embedding = summary_embedding # Update embedding + + # Delete old chunks + await session.execute( + delete(Chunk) + .where(Chunk.document_id == existing_doc.id) + ) + # Add new chunks + for chunk_obj in chunks_data: + chunk_obj.document_id = existing_doc.id + session.add(chunk_obj) + + documents_processed += 1 + else: + # Create new document + logger.info(f"Creating new document for file: {full_path_key}") + document = Document( + title=f"GitHub - {file_path}", + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata=doc_metadata, + content=summary_content, # Store summary + embedding=summary_embedding, + search_space_id=search_space_id, + chunks=chunks_data # Associate chunks directly + ) + session.add(document) + documents_processed += 1 + + # Commit periodically or at the end? For now, commit per repo + # await session.commit() + + except Exception as repo_err: + logger.error(f"Failed to process repository {repo_full_name}: {repo_err}") + errors.append(f"Failed processing {repo_full_name}: {repo_err}") + + # Commit all changes at the end + await session.commit() + logger.info(f"Finished GitHub indexing for connector {connector_id}. 
Processed {documents_processed} files.") + + except SQLAlchemyError as db_err: + await session.rollback() + logger.error(f"Database error during GitHub indexing for connector {connector_id}: {db_err}") + errors.append(f"Database error: {db_err}") + return documents_processed, "; ".join(errors) if errors else str(db_err) + except Exception as e: + await session.rollback() + logger.error(f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", exc_info=True) + errors.append(f"Unexpected error: {e}") + return documents_processed, "; ".join(errors) if errors else str(e) + + error_message = "; ".join(errors) if errors else None + return documents_processed, error_message diff --git a/surfsense_backend/main.py b/surfsense_backend/main.py index 76d478b..81ef520 100644 --- a/surfsense_backend/main.py +++ b/surfsense_backend/main.py @@ -1,5 +1,12 @@ import uvicorn import argparse +import logging + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' +) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run the SurfSense application') diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 2d1e00a..9fb5fbb 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -5,12 +5,14 @@ description = "Add your description here" readme = "README.md" requires-python = ">=3.12" dependencies = [ + "alembic>=1.13.0", "asyncpg>=0.30.0", "chonkie[all]>=0.4.1", "fastapi>=0.115.8", "fastapi-users[oauth,sqlalchemy]>=14.0.1", "firecrawl-py>=1.12.0", "gpt-researcher>=0.12.12", + "github3.py==4.0.1", "langchain-community>=0.3.17", "langchain-unstructured>=0.1.6", "litellm>=1.61.4", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index e64ff39..5211ea0 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -92,6 +92,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, ] +[[package]] +name = "alembic" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mako" }, + { name = "sqlalchemy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e6/57/e314c31b261d1e8a5a5f1908065b4ff98270a778ce7579bd4254477209a7/alembic-1.15.2.tar.gz", hash = "sha256:1c72391bbdeffccfe317eefba686cb9a3c078005478885413b95c3b26c57a8a7", size = 1925573 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/18/d89a443ed1ab9bcda16264716f809c663866d4ca8de218aa78fd50b38ead/alembic-1.15.2-py3-none-any.whl", hash = "sha256:2e76bd916d547f6900ec4bb5a90aeac1485d2c92536923d0b138c02b126edc53", size = 231911 }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -884,6 +898,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, ] +[[package]] +name = "github3-py" +version = "4.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyjwt", extra = ["crypto"] }, + { name = "python-dateutil" }, + { name = "requests" }, + { name = "uritemplate" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/89/91/603bcaf8cd1b3927de64bf56c3a8915f6653ea7281919140c5bcff2bfe7b/github3.py-4.0.1.tar.gz", hash = "sha256:30d571076753efc389edc7f9aaef338a4fcb24b54d8968d5f39b1342f45ddd36", size = 36214038 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800 }, +] + [[package]] name = "google-api-core" version = "2.24.2" @@ -1614,6 +1643,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/a1/3e145759e776c8866488a71270c399bf7c4e554551ac2e247aa0a18a0596/makefun-1.15.6-py2.py3-none-any.whl", hash = "sha256:e69b870f0bb60304765b1e3db576aaecf2f9b3e5105afe8cfeff8f2afe6ad067", size = 22946 }, ] +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509 }, +] + [[package]] name = "markdown" version = "3.7" @@ -3228,11 +3269,13 @@ name = "surf-new-backend" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "alembic" }, { name = "asyncpg" }, { name = "chonkie", extra = ["all"] }, { name = "fastapi" }, { name = "fastapi-users", extra = ["oauth", "sqlalchemy"] }, { name = "firecrawl-py" }, + { name = "github3-py" }, { name = "gpt-researcher" }, { name = "langchain-community" }, { name = "langchain-unstructured" }, @@ -3254,11 +3297,13 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.30.0" }, { name = "chonkie", extras = ["all"], specifier = ">=0.4.1" }, { name = "fastapi", specifier = ">=0.115.8" }, { name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" }, { name = "firecrawl-py", specifier = ">=1.12.0" }, + { name = "github3-py", specifier = "==4.0.1" }, { name = "gpt-researcher", specifier = ">=0.12.12" }, { name = "langchain-community", specifier = ">=0.3.17" }, { name = "langchain-unstructured", specifier = ">=0.1.6" }, @@ -3658,6 +3703,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/6d/adb955ecf60811a3735d508974bbb5358e7745b635dc001329267529c6f2/unstructured.pytesseract-0.3.15-py3-none-any.whl", hash = "sha256:a3f505c5efb7ff9f10379051a7dd6aa624b3be6b0f023ed6767cc80d0b1613d1", size = 14992 }, ] +[[package]] +name = "uritemplate" +version = "4.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d2/5a/4742fdba39cd02a56226815abfa72fe0aa81c33bed16ed045647d6000eba/uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", size = 273898 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c0/7461b49cd25aeece13766f02ee576d1db528f1c37ce69aee300e075b485b/uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e", size = 10356 }, +] + 
[[package]] name = "urllib3" version = "2.3.0" diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index dfc8b82..817ca58 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -44,6 +44,7 @@ const getConnectorTypeDisplay = (type: string): string => { "TAVILY_API": "Tavily API", "SLACK_CONNECTOR": "Slack", "NOTION_CONNECTOR": "Notion", + "GITHUB_CONNECTOR": "GitHub", // Add other connector types here as needed }; return typeMap[type] || type; @@ -253,4 +254,4 @@ export default function ConnectorsPage() { ); -} \ No newline at end of file +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx index 7700bc8..e841639 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx @@ -51,6 +51,7 @@ const getConnectorTypeDisplay = (type: string): string => { "TAVILY_API": "Tavily API", "SLACK_CONNECTOR": "Slack Connector", "NOTION_CONNECTOR": "Notion Connector", + "GITHUB_CONNECTOR": "GitHub Connector", // Add other connector types here as needed }; return typeMap[type] || type; @@ -85,7 +86,8 @@ export default function EditConnectorPage() { "SERPER_API": "SERPER_API_KEY", "TAVILY_API": "TAVILY_API_KEY", "SLACK_CONNECTOR": "SLACK_BOT_TOKEN", - "NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN" + "NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN", + "GITHUB_CONNECTOR": "GITHUB_PAT" }; return fieldMap[connectorType] || ""; }; @@ -136,6 +138,8 @@ export default function EditConnectorPage() { name: values.name, connector_type: connector.connector_type, config: updatedConfig, + is_indexable: connector.is_indexable, + last_indexed_at: connector.last_indexed_at, }); toast.success("Connector updated successfully!"); @@ -223,17 +227,21 @@ export default function EditConnectorPage() { ? "Slack Bot Token" : connector?.connector_type === "NOTION_CONNECTOR" ? "Notion Integration Token" - : "API Key"} + : connector?.connector_type === "GITHUB_CONNECTOR" + ? "GitHub Personal Access Token (PAT)" + : "API Key"} @@ -243,7 +251,9 @@ export default function EditConnectorPage() { ? "Enter a new Slack Bot Token or leave blank to keep your existing token." : connector?.connector_type === "NOTION_CONNECTOR" ? "Enter a new Notion Integration Token or leave blank to keep your existing token." - : "Enter a new API key or leave blank to keep your existing key."} + : connector?.connector_type === "GITHUB_CONNECTOR" + ? "Enter a new GitHub PAT or leave blank to keep your existing token." 
+ : "Enter a new API key or leave blank to keep your existing key."} @@ -276,4 +286,4 @@ export default function EditConnectorPage() { ); -} \ No newline at end of file +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/github-connector/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/github-connector/page.tsx new file mode 100644 index 0000000..45534d6 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/github-connector/page.tsx @@ -0,0 +1,298 @@ +"use client"; + +import { useState } from "react"; +import { useRouter, useParams } from "next/navigation"; +import { motion } from "framer-motion"; +import { zodResolver } from "@hookform/resolvers/zod"; +import { useForm } from "react-hook-form"; +import * as z from "zod"; +import { toast } from "sonner"; +import { ArrowLeft, Check, Info, Loader2, Github } from "lucide-react"; + +// Assuming useSearchSourceConnectors hook exists and works similarly +import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors"; +import { + Form, + FormControl, + FormDescription, + FormField, + FormItem, + FormLabel, + FormMessage, +} from "@/components/ui/form"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { + Alert, + AlertDescription, + AlertTitle, +} from "@/components/ui/alert"; +import { + Accordion, + AccordionContent, + AccordionItem, + AccordionTrigger, +} from "@/components/ui/accordion"; +import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; + +// Define the form schema with Zod for GitHub +const githubConnectorFormSchema = z.object({ + name: z.string().min(3, { + message: "Connector name must be at least 3 characters.", + }), + github_pat: z.string() + .min(20, { // Apply min length first + message: "GitHub Personal Access Token seems too short.", + }) + .refine(pat => pat.startsWith('ghp_') || pat.startsWith('github_pat_'), { // Then refine the pattern + message: "GitHub PAT should start with 'ghp_' or 'github_pat_'", + }), +}); + +// Define the type for the form values +type GithubConnectorFormValues = z.infer; + +export default function GithubConnectorPage() { + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + const [isSubmitting, setIsSubmitting] = useState(false); + const { createConnector } = useSearchSourceConnectors(); // Assuming this hook exists + + // Initialize the form + const form = useForm({ + resolver: zodResolver(githubConnectorFormSchema), + defaultValues: { + name: "GitHub Connector", + github_pat: "", + }, + }); + + // Handle form submission + const onSubmit = async (values: GithubConnectorFormValues) => { + setIsSubmitting(true); + try { + await createConnector({ + name: values.name, + connector_type: "GITHUB_CONNECTOR", + config: { + GITHUB_PAT: values.github_pat, + }, + is_indexable: true, // GitHub connector is indexable + last_indexed_at: null, // New connector hasn't been indexed + }); + + toast.success("GitHub connector created successfully!"); + + // Navigate back to connectors management page (or the add page) + router.push(`/dashboard/${searchSpaceId}/connectors`); + } catch (error) { // Added type check for error + console.error("Error creating GitHub connector:", error); + // Display specific backend error message if available + const errorMessage = 
error instanceof Error ? error.message : "Failed to create GitHub connector. Please check the PAT and permissions."; + toast.error(errorMessage); + } finally { + setIsSubmitting(false); + } + }; + + return ( +
+ + + + + + Connect GitHub + Setup Guide + + + + + + Connect GitHub Account + + Integrate with GitHub using a Personal Access Token (PAT) to search and retrieve information from accessible repositories. This connector can index your code and documentation. + + + + + + GitHub Personal Access Token (PAT) Required + + You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to use this connector. You can create one from your + + GitHub Developer Settings + . + + + +
+ + ( + + Connector Name + + + + + A friendly name to identify this GitHub connection. + + + + )} + /> + + ( + + GitHub Personal Access Token (PAT) + + + + + Your GitHub PAT will be encrypted and stored securely. Ensure it has the necessary 'repo' scopes. + + + + )} + /> + +
+ What you get with GitHub integration:
+ • Search through code and documentation in your repositories
+ • Access READMEs, Markdown files, and common code files
+ • Connect your project knowledge directly to your search space
+ • Index your repositories for enhanced search capabilities
+ GitHub Connector Setup Guide
+ Learn how to generate a Personal Access Token (PAT) and connect your GitHub account.
+ How it works
+ The GitHub connector uses a Personal Access Token (PAT) to authenticate with the GitHub API. It fetches information about repositories accessible to the token and indexes relevant files (code, markdown, text).
+ • The connector indexes files based on common code and documentation extensions.
+ • Large files (over 1MB) are skipped during indexing.
+ • Indexing runs periodically (check connector settings for frequency) to keep content up-to-date.
+ Step 1: Create a GitHub PAT
+ Token Security
+ Treat your PAT like a password. Store it securely and consider using fine-grained tokens if possible.
+ Generating a Token:
+ 1. Go to your GitHub Developer settings.
+ 2. Click on Personal access tokens, then choose Tokens (classic) or Fine-grained tokens (recommended if available and suitable).
+ 3. Click Generate new token (and choose the appropriate type).
+ 4. Give your token a descriptive name (e.g., "SurfSense Connector").
+ 5. Set an expiration date for the token (recommended for security).
+ 6. Under Select scopes (for classic tokens) or Repository access (for fine-grained), grant the necessary permissions. At minimum, the `repo` scope (or equivalent read access to repositories for fine-grained tokens) is required to read repository content.
+ 7. Click Generate token.
+ 8. Important: Copy your new PAT immediately. You won't be able to see it again after leaving the page.
+ Step 2: Connect in SurfSense
+ 1. Paste the copied GitHub PAT into the "GitHub Personal Access Token (PAT)" field on the "Connect GitHub" tab.
+ 2. Optionally, give the connector a custom name.
+ 3. Click the Connect GitHub button.
+ 4. If the connection is successful, you will be redirected and can start indexing from the Connectors page.
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index d68de69..f70bb62 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -14,6 +14,7 @@ import { IconMail, IconBrandZoom, IconChevronRight, + IconWorldWww, } from "@tabler/icons-react"; import { motion, AnimatePresence } from "framer-motion"; import { useState } from "react"; @@ -22,36 +23,43 @@ import Link from "next/link"; import { Button } from "@/components/ui/button"; import { Separator } from "@/components/ui/separator"; import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; +import { useForm } from "react-hook-form"; + +// Define the Connector type +interface Connector { + id: string; + title: string; + description: string; + icon: React.ReactNode; + status: "available" | "coming-soon" | "connected"; // Added connected status example +} + +interface ConnectorCategory { + id: string; + title: string; + connectors: Connector[]; +} // Define connector categories and their connectors -const connectorCategories = [ +const connectorCategories: ConnectorCategory[] = [ { id: "search-engines", title: "Search Engines", - description: "Connect to search engines to enhance your research capabilities.", - icon: , connectors: [ { - id: "tavily-api", - title: "Tavily Search API", - description: "Connect to Tavily Search API to search the web.", - icon: , - status: "available", - }, - { - id: "serper-api", - title: "Serper API", - description: "Connect to Serper API to search the web.", - icon: , - status: "coming-soon", + id: "web-search", + title: "Web Search", + description: "Enable web search capabilities for broader context.", + icon: , + status: "available", // Example status + // Potentially add config form here if needed (e.g., choosing provider) }, + // Add other search engine connectors like Tavily, Serper if they have UI config ], }, { id: "team-chats", title: "Team Chats", - description: "Connect to your team communication platforms.", - icon: , connectors: [ { id: "slack-connector", @@ -79,8 +87,6 @@ const connectorCategories = [ { id: "knowledge-bases", title: "Knowledge Bases", - description: "Connect to your knowledge bases and documentation.", - icon: , connectors: [ { id: "notion-connector", @@ -88,21 +94,20 @@ const connectorCategories = [ description: "Connect to your Notion workspace to access pages and databases.", icon: , status: "available", + // No form here, assumes it links to its own page }, { - id: "github", + id: "github-connector", // Keep the id simple title: "GitHub", - description: "Connect to GitHub repositories to access code and documentation.", + description: "Connect a GitHub PAT to index code and docs from accessible repositories.", icon: , - status: "coming-soon", + status: "available", }, ], }, { id: "communication", title: "Communication", - description: "Connect to your email and meeting platforms.", - icon: , connectors: [ { id: "gmail", @@ -125,7 +130,7 @@ const connectorCategories = [ export default function ConnectorsPage() { const params = useParams(); const searchSpaceId = params.search_space_id as string; - const [expandedCategories, setExpandedCategories] = useState(["search-engines"]); + const [expandedCategories, setExpandedCategories] = useState(["search-engines", "knowledge-bases"]); const toggleCategory = (categoryId: string) => 
{ setExpandedCategories(prev =>
@@ -150,104 +155,68 @@ export default function ConnectorsPage() {
- {connectorCategories.map((category, categoryIndex) => ( + {connectorCategories.map((category) => ( toggleCategory(category.id)} - className="border rounded-lg overflow-hidden bg-card" + className="space-y-2" > - - -
-
- {category.icon} -
-
-

{category.title}

-

{category.description}

-
-
- -
-
+
+

{category.title}

+ + {/* Replace with your preferred expand/collapse icon/button */} + + +
- -
- - {category.connectors.map((connector, index) => ( - -
- -
- {connector.icon} +
+ {category.connectors.map((connector) => ( +
+
+
+
+ {connector.icon} +

+ {connector.title} +

+ {connector.status === "coming-soon" && ( + + Coming soon + + )} + {/* TODO: Add 'Connected' badge based on actual state */} +
+

+ {connector.description} +

- -
-

- {connector.title} -

- {connector.status === "coming-soon" && ( - Coming soon - )} -
- -

- {connector.description} -

- - {connector.status === "available" ? ( - -
+ {/* Always render Link button if available */} + {connector.status === 'available' && ( +
+ + - ) : ( -
+ )} + {connector.status === 'coming-soon' && ( +
+ - )} - - ))} - +
+ )} + {/* TODO: Add logic for 'connected' status */} +
+ ))}
+ ))}
From a26fac435b8bcee0e189ffa422729e2c9ca9ac7c Mon Sep 17 00:00:00 2001 From: Adamsmith6300 Date: Sun, 13 Apr 2025 21:23:05 -0700 Subject: [PATCH 2/4] documents table migration, fix/update github indexing --- ...1_add_github_connector_to_documenttype_.py | 70 +++++++++++++++++++ .../app/connectors/github_connector.py | 57 +++++++++++---- .../tasks/stream_connector_search_results.py | 27 +++++++ .../app/utils/connector_service.py | 55 ++++++++++++++- .../documents/(manage)/page.tsx | 5 +- 5 files changed, 197 insertions(+), 17 deletions(-) create mode 100644 surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py new file mode 100644 index 0000000..1f15912 --- /dev/null +++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py @@ -0,0 +1,70 @@ +"""Add GITHUB_CONNECTOR to DocumentType enum + +Revision ID: e55302644c51 +Revises: 1 +Create Date: 2025-04-13 19:56:00.059921 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e55302644c51' +down_revision: Union[str, None] = '1' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +# Define the ENUM type name and the new value +ENUM_NAME = 'documenttype' # Make sure this matches the name in your DB (usually lowercase class name) +NEW_VALUE = 'GITHUB_CONNECTOR' + +def upgrade() -> None: + """Upgrade schema.""" + op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'") + + +# Warning: This will delete all rows with the new value +def downgrade() -> None: + """Downgrade schema - remove GITHUB_CONNECTOR from enum.""" + + # The old type name + old_enum_name = f"{ENUM_NAME}_old" + + # Enum values *before* GITHUB_CONNECTOR was added + old_values = ( + 'EXTENSION', + 'CRAWLED_URL', + 'FILE', + 'SLACK_CONNECTOR', + 'NOTION_CONNECTOR', + 'YOUTUBE_VIDEO' + ) + old_values_sql = ", ".join([f"'{v}'" for v in old_values]) + + # Table and column names (adjust if different) + table_name = 'documents' + column_name = 'document_type' + + # 1. Rename the current enum type + op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}") + + # 2. Create the new enum type with the old values + op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})") + + # 3. Update the table: + op.execute( + f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" + ) + + # 4. Alter the column to use the new enum type (casting old values) + op.execute( + f"ALTER TABLE {table_name} ALTER COLUMN {column_name} " + f"TYPE {ENUM_NAME} USING {column_name}::text::{ENUM_NAME}" + ) + + # 5. 
Drop the old enum type + op.execute(f"DROP TYPE {old_enum_name}") + # ### end Alembic commands ### diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index d827dac..265f89b 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -2,7 +2,6 @@ import base64 import logging from typing import List, Optional, Dict, Any, Tuple from github3 import login as github_login, exceptions as github_exceptions -from github3.repos.repo import Repository from github3.repos.contents import Contents from github3.exceptions import ForbiddenError, NotFoundError @@ -26,6 +25,33 @@ MAX_FILE_SIZE = 1 * 1024 * 1024 class GitHubConnector: """Connector for interacting with the GitHub API.""" + # Directories to skip during file traversal + SKIPPED_DIRS = { + # Version control + '.git', + # Dependencies + 'node_modules', + 'vendor', + # Build artifacts / Caches + 'build', + 'dist', + 'target', + '__pycache__', + # Virtual environments + 'venv', + '.venv', + 'env', + # IDE/Editor config + '.vscode', + '.idea', + '.project', + '.settings', + # Temporary / Logs + 'tmp', + 'logs', + # Add other project-specific irrelevant directories if needed + } + def __init__(self, token: str): """ Initializes the GitHub connector. @@ -54,17 +80,16 @@ class GitHubConnector: # type='owner' fetches repos owned by the user # type='member' fetches repos the user is a collaborator on (including orgs) # type='all' fetches both - for repo in self.gh.repositories(type='all', sort='updated'): - if isinstance(repo, Repository): - repos_data.append({ - "id": repo.id, - "name": repo.name, - "full_name": repo.full_name, - "private": repo.private, - "url": repo.html_url, - "description": repo.description or "", - "last_updated": repo.updated_at.isoformat() if repo.updated_at else None, - }) + for repo in self.gh.repositories(type='owner', sort='updated'): + repos_data.append({ + "id": repo.id, + "name": repo.name, + "full_name": repo.full_name, + "private": repo.private, + "url": repo.html_url, + "description": repo.description or "", + "last_updated": repo.updated_at if repo.updated_at else None, + }) logger.info(f"Fetched {len(repos_data)} repositories.") return repos_data except Exception as e: @@ -90,8 +115,7 @@ class GitHubConnector: if not repo: logger.warning(f"Repository '{repo_full_name}' not found.") return [] - - contents = repo.directory_contents(path=path) # Use directory_contents for clarity + contents = repo.directory_contents(directory_path=path) # Use directory_contents for clarity # contents returns a list of tuples (name, content_obj) for item_name, content_item in contents: @@ -99,6 +123,11 @@ class GitHubConnector: continue if content_item.type == 'dir': + # Check if the directory name is in the skipped list + if content_item.name in self.SKIPPED_DIRS: + logger.debug(f"Skipping directory: {content_item.path}") + continue # Skip recursion for this directory + # Recursively fetch contents of subdirectory files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path)) elif content_item.type == 'file': diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py index 5c563dc..b9a703a 100644 --- a/surfsense_backend/app/tasks/stream_connector_search_results.py +++ b/surfsense_backend/app/tasks/stream_connector_search_results.py @@ -244,6 +244,33 @@ async def stream_connector_search_results( 
all_raw_documents.extend(notion_chunks) + # Github Connector + if connector == "GITHUB_CONNECTOR": + # Send terminal message about starting search + yield streaming_service.add_terminal_message("Starting to search for GitHub connector...") + print("Starting to search for GitHub connector...") + # Search using Github API with reformulated query + result_object, github_chunks = await connector_service.search_github( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=TOP_K + ) + + # Send terminal message about search results + yield streaming_service.add_terminal_message( + f"Found {len(result_object['sources'])} relevant results from Github", + "success" + ) + + # Update sources + all_sources.append(result_object) + yield streaming_service.update_sources(all_sources) + + # Add documents to collection + all_raw_documents.extend(github_chunks) + + # If we have documents to research diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 9e676e5..8d7a551 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -558,4 +558,57 @@ class ConnectorService: "sources": sources_list, } - return result_object, youtube_chunks \ No newline at end of file + return result_object, youtube_chunks + + async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple: + """ + Search for GitHub documents and return both the source information and langchain documents + + Returns: + tuple: (sources_info, langchain_documents) + """ + github_chunks = await self.retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="GITHUB_CONNECTOR" + ) + + # Map github_chunks to the required format + mapped_sources = {} + for i, chunk in enumerate(github_chunks): + # Fix for UI - assign a unique ID for citation/source tracking + github_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Create a mapped source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'GitHub Document'), # Use specific title if available + "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview + "url": metadata.get('url', '') # Use URL if available in metadata + } + + self.source_id_counter += 1 + + # Use a unique identifier for tracking unique sources (URL preferred) + source_key = source.get("url") or source.get("title") + if source_key and source_key not in mapped_sources: + mapped_sources[source_key] = source + + # Convert to list of sources + sources_list = list(mapped_sources.values()) + + # Create result object + result_object = { + "id": 7, # Assuming 7 is the next available ID + "name": "GitHub", + "type": "GITHUB_CONNECTOR", + "sources": sources_list, + } + + return result_object, github_chunks diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index 66f8b08..18b4357 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -94,7 +94,7 @@ import rehypeSanitize from "rehype-sanitize"; import remarkGfm from "remark-gfm"; import { 
DocumentViewer } from "@/components/document-viewer"; import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; -import { IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react"; +import { IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react"; // Define animation variants for reuse const fadeInScale = { @@ -142,6 +142,7 @@ const documentTypeIcons = { NOTION_CONNECTOR: IconBrandNotion, FILE: File, YOUTUBE_VIDEO: IconBrandYoutube, + GITHUB_CONNECTOR: IconBrandGithub, } as const; const columns: ColumnDef[] = [ @@ -1028,4 +1029,4 @@ function RowActions({ row }: { row: Row }) { ); } -export { DocumentsTable } \ No newline at end of file +export { DocumentsTable } From 396c070b28b0c2d0f7a8f54a104c64f22fcd03c0 Mon Sep 17 00:00:00 2001 From: Adamsmith6300 Date: Sun, 13 Apr 2025 21:33:10 -0700 Subject: [PATCH 3/4] addressing coderabbit PR comment --- surfsense_backend/app/schemas/search_source_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 5386658..41e1086 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -59,14 +59,14 @@ class SearchSourceConnectorBase(BaseModel): raise ValueError("NOTION_INTEGRATION_TOKEN cannot be empty") elif connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR: - # For GITHUB_CONNECTOR, only allow GITHUB_TOKEN + # For GITHUB_CONNECTOR, only allow GITHUB_PAT allowed_keys = ["GITHUB_PAT"] if set(config.keys()) != set(allowed_keys): raise ValueError(f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") # Ensure the token is not empty if not config.get("GITHUB_PAT"): - raise ValueError("GITHUB_TOKEN cannot be empty") + raise ValueError("GITHUB_PAT cannot be empty") return config From 3e472c124fe323ca10ea8230b2bb20c3d0a857db Mon Sep 17 00:00:00 2001 From: Adamsmith6300 Date: Mon, 14 Apr 2025 17:04:43 -0700 Subject: [PATCH 4/4] sync with main and address comments --- .../app/utils/connector_service.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index e0d6322..fe08572 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -529,8 +529,8 @@ class ConnectorService: document_type="GITHUB_CONNECTOR" ) - # Map github_chunks to the required format - mapped_sources = {} + # Process each chunk and create sources directly without deduplication + sources_list = [] for i, chunk in enumerate(github_chunks): # Fix for UI - assign a unique ID for citation/source tracking github_chunks[i]['document']['id'] = self.source_id_counter @@ -539,7 +539,7 @@ class ConnectorService: document = chunk.get('document', {}) metadata = document.get('metadata', {}) - # Create a mapped source entry + # Create a source entry source = { "id": self.source_id_counter, "title": document.get('title', 'GitHub Document'), # Use specific title if available @@ -548,18 +548,11 @@ class ConnectorService: } self.source_id_counter += 1 - - # Use a unique identifier for tracking unique sources (URL preferred) - source_key = source.get("url") or source.get("title") - if source_key and source_key not in mapped_sources: - mapped_sources[source_key] = source - - # 
Convert to list of sources - sources_list = list(mapped_sources.values()) + sources_list.append(source) # Create result object result_object = { - "id": 7, # Assuming 7 is the next available ID + "id": 8, "name": "GitHub", "type": "GITHUB_CONNECTOR", "sources": sources_list,
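
Note (not part of the patch): a minimal sketch of how the GitHubConnector introduced in this series might be exercised, assuming the module path app.connectors.github_connector (i.e. run from surfsense_backend) and the signatures visible in the hunks above — GitHubConnector(token=...), get_user_repositories(), and get_repository_files(repo_full_name, path=""). The per-file dict layout returned by get_repository_files is not shown in the diff, so the final count line is illustrative only.

import os

from app.connectors.github_connector import GitHubConnector

# Same config key the SearchSourceConnector schema validates for GITHUB_CONNECTOR.
pat = os.environ["GITHUB_PAT"]
connector = GitHubConnector(token=pat)

# Repositories owned by the token's user (type='owner' after this patch); each entry
# is a dict with id, name, full_name, private, url, description, last_updated.
repos = connector.get_user_repositories()
for repo in repos[:3]:
    print(repo["full_name"], "-", repo["description"] or "(no description)")

# Recursively list files for one repository; directories in SKIPPED_DIRS
# (.git, node_modules, venv, ...) are pruned during traversal.
if repos:
    files = connector.get_repository_files(repos[0]["full_name"])
    print(f"{len(files)} candidate files for indexing")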