add github connector, add alembic for db migrations, fix bug updating connectors

Adamsmith6300 2025-04-13 13:56:22 -07:00
parent fa5dbb786f
commit bb198e38c0
18 changed files with 1232 additions and 184 deletions

View file

@@ -3,4 +3,5 @@
venv/
data/
__pycache__/
.flashrank_cache
surf_new_backend.egg-info/

View file

@@ -0,0 +1,119 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts.
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires Python>=3.9 or the backports.zoneinfo library, plus the tzdata library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
# version_path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
version_path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# The SQLAlchemy URL to connect to
# IMPORTANT: Replace this with your actual async database URL
sqlalchemy.url = postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
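
With this configuration in place, migrations are normally driven from the CLI (`alembic revision --autogenerate -m "..."` to generate a revision, then `alembic upgrade head` to apply it). The same flow is available through Alembic's Python command API; a minimal sketch, assuming it is run from the directory containing this alembic.ini:

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # picks up script_location and sqlalchemy.url from above

# Generate a revision by diffing the models against the live database
command.revision(cfg, message="describe your change", autogenerate=True)

# Apply all pending revisions
command.upgrade(cfg, "head")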

View file

@@ -0,0 +1 @@
Generic single-database configuration with an async dbapi.

View file

@@ -0,0 +1,98 @@
import asyncio
from logging.config import fileConfig
import os
import sys
from sqlalchemy import pool
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import async_engine_from_config
from alembic import context
# Ensure the app directory is in the Python path
# This allows Alembic to find your models
sys.path.insert(0, os.path.realpath(os.path.join(os.path.dirname(__file__), '..')))
# Import your models base
from app.db import Base # Assuming your Base is defined in app.db
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def do_run_migrations(connection: Connection) -> None:
context.configure(connection=connection, target_metadata=target_metadata)
with context.begin_transaction():
context.run_migrations()
async def run_async_migrations() -> None:
"""In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = async_engine_from_config(
config.get_section(config.config_ini_section, {}),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
async with connectable.connect() as connection:
await connection.run_sync(do_run_migrations)
await connectable.dispose()
def run_migrations_online() -> None:
"""Run migrations in 'online' mode."""
asyncio.run(run_async_migrations())
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
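
The offline branch above renders the migration as SQL text instead of executing it, which is what the CLI's --sql flag uses; online mode connects through asyncpg and applies the migrations directly. A short sketch of both via the command API (same assumptions as the alembic.ini example):

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "head", sql=True)  # offline: emit SQL, no live database needed
command.upgrade(cfg, "head")            # online: run the migrations against the database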

View file

@@ -0,0 +1,28 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}
def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}

View file

@@ -0,0 +1,53 @@
"""Add GITHUB_CONNECTOR to SearchSourceConnectorType enum
Revision ID: 1
Revises:
Create Date: 2023-10-27 10:00:00.000000
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Import pgvector if needed for other types, though not for this ENUM change
# import pgvector
# revision identifiers, used by Alembic.
revision: str = '1'
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
# Manually add the new enum value; Alembic autogenerate does not detect
# changes to enum members, so this statement has to be written by hand
op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'")
# No other schema changes in this revision
# ### end Alembic commands ###
def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
# Removing an enum value is complex and potentially dangerous if the value
# is still in use. Here we recreate the type without 'GITHUB_CONNECTOR' and
# re-cast the column; the cast below will fail if any rows still hold
# 'GITHUB_CONNECTOR', so check for (or migrate) such rows first.
op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old")
op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')")
op.execute((
"ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING "
"connector_type::text::searchsourceconnectortype"
))
op.execute("DROP TYPE searchsourceconnectortype_old")
# ### end Alembic commands ###
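
The in-use check that the downgrade comments allude to could be a hypothetical guard like the one below, run before the type is recreated (the table and column names are the ones used in this migration):

def downgrade() -> None:
    bind = op.get_bind()
    in_use = bind.execute(sa.text(
        "SELECT COUNT(*) FROM search_source_connectors "
        "WHERE connector_type = 'GITHUB_CONNECTOR'"
    )).scalar()
    if in_use:
        # Refuse to downgrade while rows still reference the value
        raise RuntimeError(f"Cannot downgrade: {in_use} connector(s) still use GITHUB_CONNECTOR")
    # ... then rename, recreate, and re-cast the type as above ...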

View file

@@ -0,0 +1,182 @@
import base64
import logging
from typing import List, Optional, Dict, Any, Tuple
from github3 import login as github_login, exceptions as github_exceptions
from github3.repos.repo import Repository
from github3.repos.contents import Contents
from github3.exceptions import ForbiddenError, NotFoundError
logger = logging.getLogger(__name__)
# List of common code file extensions to target
CODE_EXTENSIONS = {
'.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.c', '.cpp', '.h', '.hpp',
'.cs', '.go', '.rb', '.php', '.swift', '.kt', '.scala', '.rs', '.m',
'.sh', '.bash', '.ps1', '.lua', '.pl', '.pm', '.r', '.dart', '.sql'
}
# List of common documentation/text file extensions
DOC_EXTENSIONS = {
'.md', '.txt', '.rst', '.adoc', '.html', '.htm', '.xml', '.json', '.yaml', '.yml', '.toml'
}
# Maximum file size in bytes (e.g., 1MB)
MAX_FILE_SIZE = 1 * 1024 * 1024
class GitHubConnector:
"""Connector for interacting with the GitHub API."""
def __init__(self, token: str):
"""
Initializes the GitHub connector.
Args:
token: GitHub Personal Access Token (PAT).
"""
if not token:
raise ValueError("GitHub token cannot be empty.")
try:
self.gh = github_login(token=token)
# Try a simple authenticated call to check token validity
self.gh.me()
logger.info("Successfully authenticated with GitHub API.")
except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
logger.error(f"GitHub authentication failed: {e}")
raise ValueError("Invalid GitHub token or insufficient permissions.")
except Exception as e:
logger.error(f"Failed to initialize GitHub client: {e}")
raise
def get_user_repositories(self) -> List[Dict[str, Any]]:
"""Fetches repositories accessible by the authenticated user."""
repos_data = []
try:
# type='owner' fetches repos owned by the user
# type='member' fetches repos the user is a collaborator on (including orgs)
# type='all' fetches both
for repo in self.gh.repositories(type='all', sort='updated'):
# repositories() yields ShortRepository objects in github3 4.x, which are
# not instances of Repository, so an isinstance(repo, Repository) filter
# here would silently skip every repository
repos_data.append({
"id": repo.id,
"name": repo.name,
"full_name": repo.full_name,
"private": repo.private,
"url": repo.html_url,
"description": repo.description or "",
"last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
})
logger.info(f"Fetched {len(repos_data)} repositories.")
return repos_data
except Exception as e:
logger.error(f"Failed to fetch GitHub repositories: {e}")
return [] # Return empty list on error
def get_repository_files(self, repo_full_name: str, path: str = '') -> List[Dict[str, Any]]:
"""
Recursively fetches details of relevant files (code, docs) within a repository path.
Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo').
path: The starting path within the repository (default is root).
Returns:
A list of dictionaries, each containing file details (path, sha, url, size).
Returns an empty list if the repository or path is not found or on error.
"""
files_list = []
try:
owner, repo_name = repo_full_name.split('/')
repo = self.gh.repository(owner, repo_name)
if not repo:
logger.warning(f"Repository '{repo_full_name}' not found.")
return []
contents = repo.directory_contents(path) # pass the path positionally; the keyword argument is directory_path, not path
# directory_contents returns a list of (name, Contents) tuples
for item_name, content_item in contents:
if not isinstance(content_item, Contents):
continue
if content_item.type == 'dir':
# Recursively fetch contents of subdirectory
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
elif content_item.type == 'file':
# Check if the file extension is relevant and size is within limits
file_extension = '.' + content_item.name.split('.')[-1].lower() if '.' in content_item.name else ''
is_code = file_extension in CODE_EXTENSIONS
is_doc = file_extension in DOC_EXTENSIONS
if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
files_list.append({
"path": content_item.path,
"sha": content_item.sha,
"url": content_item.html_url,
"size": content_item.size,
"type": "code" if is_code else "doc"
})
elif content_item.size > MAX_FILE_SIZE:
logger.debug(f"Skipping large file: {content_item.path} ({content_item.size} bytes)")
else:
logger.debug(f"Skipping irrelevant file type: {content_item.path}")
except (NotFoundError, ForbiddenError) as e:
logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
except Exception as e:
logger.error(f"Failed to get files for {repo_full_name} at path '{path}': {e}")
# Return what we have collected so far in case of partial failure
return files_list
def get_file_content(self, repo_full_name: str, file_path: str) -> Optional[str]:
"""
Fetches the decoded content of a specific file.
Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo').
file_path: The path to the file within the repository.
Returns:
The decoded file content as a string, or None if fetching fails or file is too large.
"""
try:
owner, repo_name = repo_full_name.split('/')
repo = self.gh.repository(owner, repo_name)
if not repo:
logger.warning(f"Repository '{repo_full_name}' not found when fetching file '{file_path}'.")
return None
content_item = repo.file_contents(path=file_path) # Use file_contents for clarity
if not content_item or not isinstance(content_item, Contents) or content_item.type != 'file':
logger.warning(f"File '{file_path}' not found or is not a file in '{repo_full_name}'.")
return None
if content_item.size > MAX_FILE_SIZE:
logger.warning(f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch.")
return None
# Content is base64 encoded
if content_item.content:
try:
decoded_content = base64.b64decode(content_item.content).decode('utf-8')
return decoded_content
except UnicodeDecodeError:
logger.warning(f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'.")
try:
# Try a fallback encoding
decoded_content = base64.b64decode(content_item.content).decode('latin-1')
return decoded_content
except Exception as decode_err:
logger.error(f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}")
return None # Give up if fallback fails
else:
logger.warning(f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty.")
return "" # Return empty string for empty files
except (NotFoundError, ForbiddenError) as e:
logger.warning(f"Cannot access file '{file_path}' in '{repo_full_name}': {e}")
return None
except Exception as e:
logger.error(f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}")
return None
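
A minimal usage sketch of the class above (the token value is a placeholder; the import path matches the one used by the indexing task):

from app.connectors.github_connector import GitHubConnector

client = GitHubConnector(token="ghp_your_token_here")  # raises ValueError on bad credentials
for repo in client.get_user_repositories():
    for file_info in client.get_repository_files(repo["full_name"]):
        content = client.get_file_content(repo["full_name"], file_info["path"])
        if content is not None:
            print(file_info["path"], file_info["type"], len(content))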

View file

@@ -40,12 +40,14 @@ class DocumentType(str, Enum):
SLACK_CONNECTOR = "SLACK_CONNECTOR"
NOTION_CONNECTOR = "NOTION_CONNECTOR"
YOUTUBE_VIDEO = "YOUTUBE_VIDEO"
GITHUB_CONNECTOR = "GITHUB_CONNECTOR"
class SearchSourceConnectorType(str, Enum):
SERPER_API = "SERPER_API"
TAVILY_API = "TAVILY_API"
SLACK_CONNECTOR = "SLACK_CONNECTOR"
NOTION_CONNECTOR = "NOTION_CONNECTOR"
GITHUB_CONNECTOR = "GITHUB_CONNECTOR"
class ChatType(str, Enum):
GENERAL = "GENERAL"

View file

@@ -14,13 +14,13 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.exc import IntegrityError
from typing import List, Dict, Any
from app.db import get_async_session, User, SearchSourceConnector, SearchSourceConnectorType, SearchSpace
from app.db import get_async_session, User, SearchSourceConnector, SearchSourceConnectorType, SearchSpace, async_session_maker
from app.schemas import SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from pydantic import ValidationError
from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages
from datetime import datetime
from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos
from datetime import datetime, timezone
import logging
# Set up logging
@@ -50,13 +50,11 @@ async def create_search_source_connector(
)
)
existing_connector = result.scalars().first()
if existing_connector:
raise HTTPException(
status_code=409,
detail=f"A connector with type {connector.connector_type} already exists. Each user can have only one connector of each type."
)
db_connector = SearchSourceConnector(**connector.model_dump(), user_id=user.id)
session.add(db_connector)
await session.commit()
@@ -239,10 +237,15 @@ async def index_connector_content(
search_space = await check_ownership(session, SearchSpace, search_space_id, user)
# Handle different connector types
response_message = ""
indexing_from = None
indexing_to = None
today_str = datetime.now().strftime("%Y-%m-%d")
if connector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR:
# Determine the time range that will be indexed
if not connector.last_indexed_at:
start_date = "365 days ago"
start_date = "365 days ago" # Or perhaps set a specific date if needed
else:
# Check if last_indexed_at is today
today = datetime.now().date()
@@ -252,33 +255,18 @@ async def index_connector_content(
else:
start_date = connector.last_indexed_at.strftime("%Y-%m-%d")
# Add the indexing task to background tasks
if background_tasks:
background_tasks.add_task(
run_slack_indexing_with_new_session,
connector_id,
search_space_id
)
return {
"success": True,
"message": "Slack indexing started in the background",
"connector_type": connector.connector_type,
"search_space": search_space.name,
"indexing_from": start_date,
"indexing_to": datetime.now().strftime("%Y-%m-%d")
}
else:
# For testing or if background tasks are not available
return {
"success": False,
"message": "Background tasks not available",
"connector_type": connector.connector_type
}
indexing_from = start_date
indexing_to = today_str
# Run indexing in background
logger.info(f"Triggering Slack indexing for connector {connector_id} into search space {search_space_id}")
background_tasks.add_task(run_slack_indexing_with_new_session, connector_id, search_space_id)
response_message = "Slack indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR:
# Determine the time range that will be indexed
if not connector.last_indexed_at:
start_date = "365 days ago"
start_date = "365 days ago" # Or perhaps set a specific date
else:
# Check if last_indexed_at is today
today = datetime.now().date()
@@ -288,44 +276,46 @@ async def index_connector_content(
else:
start_date = connector.last_indexed_at.strftime("%Y-%m-%d")
# Add the indexing task to background tasks
if background_tasks:
background_tasks.add_task(
run_notion_indexing_with_new_session,
connector_id,
search_space_id
)
return {
"success": True,
"message": "Notion indexing started in the background",
"connector_type": connector.connector_type,
"search_space": search_space.name,
"indexing_from": start_date,
"indexing_to": datetime.now().strftime("%Y-%m-%d")
}
else:
# For testing or if background tasks are not available
return {
"success": False,
"message": "Background tasks not available",
"connector_type": connector.connector_type
}
indexing_from = start_date
indexing_to = today_str
# Run indexing in background
logger.info(f"Triggering Notion indexing for connector {connector_id} into search space {search_space_id}")
background_tasks.add_task(run_notion_indexing_with_new_session, connector_id, search_space_id)
response_message = "Notion indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR:
# The GitHub connector re-scans all accessible repositories on each run and
# deduplicates unchanged files by SHA, so there is no start date to report.
# Set indexing_from to None and indexing_to to today
indexing_from = None
indexing_to = today_str
# Run indexing in background
logger.info(f"Triggering GitHub indexing for connector {connector_id} into search space {search_space_id}")
background_tasks.add_task(run_github_indexing_with_new_session, connector_id, search_space_id)
response_message = "GitHub indexing started in the background."
else:
raise HTTPException(
status_code=400,
detail=f"Indexing not supported for connector type: {connector.connector_type}"
)
return {
"message": response_message,
"connector_id": connector_id,
"search_space_id": search_space_id,
"indexing_from": indexing_from,
"indexing_to": indexing_to
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to start indexing: {str(e)}")
logger.error(f"Failed to initiate indexing for connector {connector_id}: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Failed to start indexing: {str(e)}"
)
detail=f"Failed to initiate indexing: {str(e)}"
)
async def update_connector_last_indexed(
session: AsyncSession,
@@ -361,8 +351,6 @@ async def run_slack_indexing_with_new_session(
Create a new session and run the Slack indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
from app.db import async_session_maker
async with async_session_maker() as session:
await run_slack_indexing(session, connector_id, search_space_id)
@@ -405,8 +393,6 @@ async def run_notion_indexing_with_new_session(
Create a new session and run the Notion indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
from app.db import async_session_maker
async with async_session_maker() as session:
await run_notion_indexing(session, connector_id, search_space_id)
@@ -439,4 +425,38 @@ async def run_notion_indexing(
else:
logger.error(f"Notion indexing failed or no documents processed: {error_or_warning}")
except Exception as e:
logger.error(f"Error in background Notion indexing task: {str(e)}")
logger.error(f"Error in background Notion indexing task: {str(e)}")
# Add new helper functions for GitHub indexing
async def run_github_indexing_with_new_session(
connector_id: int,
search_space_id: int
):
"""Wrapper to run GitHub indexing with its own database session."""
logger.info(f"Background task started: Indexing GitHub connector {connector_id} into space {search_space_id}")
async with async_session_maker() as session:
await run_github_indexing(session, connector_id, search_space_id)
logger.info(f"Background task finished: Indexing GitHub connector {connector_id}")
async def run_github_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int
):
"""Runs the GitHub indexing task and updates the timestamp."""
try:
indexed_count, error_message = await index_github_repos(
session, connector_id, search_space_id, update_last_indexed=False
)
if error_message:
logger.error(f"GitHub indexing failed for connector {connector_id}: {error_message}")
# Optionally update status in DB to indicate failure
else:
logger.info(f"GitHub indexing successful for connector {connector_id}. Indexed {indexed_count} documents.")
# Update the last indexed timestamp only on success
await update_connector_last_indexed(session, connector_id)
await session.commit() # Commit timestamp update
except Exception as e:
await session.rollback()
logger.error(f"Critical error in run_github_indexing for connector {connector_id}: {e}", exc_info=True)
# Optionally update status in DB to indicate failure

View file

@@ -57,6 +57,16 @@ class SearchSourceConnectorBase(BaseModel):
# Ensure the integration token is not empty
if not config.get("NOTION_INTEGRATION_TOKEN"):
raise ValueError("NOTION_INTEGRATION_TOKEN cannot be empty")
elif connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR:
# For GITHUB_CONNECTOR, only allow GITHUB_PAT
allowed_keys = ["GITHUB_PAT"]
if set(config.keys()) != set(allowed_keys):
raise ValueError(f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
# Ensure the token is not empty
if not config.get("GITHUB_PAT"):
raise ValueError("GITHUB_PAT cannot be empty")
return config
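
For reference, a sketch of a payload this branch accepts, assuming the create schema exposes the same fields the frontend sends (the token value is a placeholder):

from app.schemas import SearchSourceConnectorCreate
from app.db import SearchSourceConnectorType

connector = SearchSourceConnectorCreate(
    name="My GitHub Connector",
    connector_type=SearchSourceConnectorType.GITHUB_CONNECTOR,
    config={"GITHUB_PAT": "ghp_your_token_here"},  # exactly this one key, non-empty
    is_indexable=True,
    last_indexed_at=None,
)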
@@ -70,4 +80,4 @@ class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, TimestampMod
user_id: uuid.UUID
class Config:
from_attributes = True

View file

@@ -3,12 +3,13 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.future import select
from sqlalchemy import delete
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from app.db import Document, DocumentType, Chunk, SearchSourceConnector, SearchSourceConnectorType
from app.config import config
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.connectors.slack_history import SlackHistory
from app.connectors.notion_history import NotionHistoryConnector
from app.connectors.github_connector import GitHubConnector
from slack_sdk.errors import SlackApiError
import logging
@@ -589,3 +590,195 @@ async def index_notion_pages(
await session.rollback()
logger.error(f"Failed to index Notion pages: {str(e)}", exc_info=True)
return 0, f"Failed to index Notion pages: {str(e)}"
async def index_github_repos(
session: AsyncSession,
connector_id: int,
search_space_id: int,
update_last_indexed: bool = True
) -> Tuple[int, Optional[str]]:
"""
Index code and documentation files from accessible GitHub repositories.
Args:
session: Database session
connector_id: ID of the GitHub connector
search_space_id: ID of the search space to store documents in
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
documents_processed = 0
errors = []
try:
# 1. Get the GitHub connector from the database
result = await session.execute(
select(SearchSourceConnector)
.filter(
SearchSourceConnector.id == connector_id,
SearchSourceConnector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR
)
)
connector = result.scalars().first()
if not connector:
return 0, f"Connector with ID {connector_id} not found or is not a GitHub connector"
# 2. Get the GitHub PAT from the connector config
github_pat = connector.config.get("GITHUB_PAT")
if not github_pat:
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
# 3. Initialize GitHub connector client
try:
github_client = GitHubConnector(token=github_pat)
except ValueError as e:
return 0, f"Failed to initialize GitHub client: {str(e)}"
# 4. Get list of accessible repositories
repositories = github_client.get_user_repositories()
if not repositories:
logger.info("No accessible GitHub repositories found for the provided token.")
return 0, "No accessible GitHub repositories found."
logger.info(f"Found {len(repositories)} repositories to potentially index.")
# 5. Get existing documents for this search space and connector type to prevent duplicates
existing_docs_result = await session.execute(
select(Document)
.filter(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GITHUB_CONNECTOR
)
)
existing_docs = existing_docs_result.scalars().all()
# Create a lookup dict: key=repo_fullname/file_path, value=Document object
existing_docs_lookup = {doc.document_metadata.get("full_path"): doc for doc in existing_docs if doc.document_metadata.get("full_path")}
logger.info(f"Found {len(existing_docs_lookup)} existing GitHub documents in database for search space {search_space_id}")
# 6. Iterate through repositories and index files
for repo_info in repositories:
repo_full_name = repo_info.get("full_name")
if not repo_full_name:
logger.warning(f"Skipping repository with missing full_name: {repo_info.get('name')}")
continue
logger.info(f"Processing repository: {repo_full_name}")
try:
files_to_index = github_client.get_repository_files(repo_full_name)
if not files_to_index:
logger.info(f"No indexable files found in repository: {repo_full_name}")
continue
logger.info(f"Found {len(files_to_index)} files to process in {repo_full_name}")
for file_info in files_to_index:
file_path = file_info.get("path")
file_url = file_info.get("url")
file_sha = file_info.get("sha")
file_type = file_info.get("type") # 'code' or 'doc'
full_path_key = f"{repo_full_name}/{file_path}"
if not file_path or not file_url or not file_sha:
logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}")
continue
# Check if document already exists and if content hash matches
existing_doc = existing_docs_lookup.get(full_path_key)
if existing_doc and existing_doc.document_metadata.get("sha") == file_sha:
logger.debug(f"Skipping unchanged file: {full_path_key}")
continue # Skip if SHA matches (content hasn't changed)
# Get file content
file_content = github_client.get_file_content(repo_full_name, file_path)
if file_content is None:
logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.")
continue # Skip if content fetch failed
# Chunk the full file content below; the document-level content and
# embedding use a short truncated preview of the file instead
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." # Simple summary
summary_embedding = config.embedding_model_instance.embed(summary_content)
# Chunk the content
try:
chunks_data = [
Chunk(content=chunk.text, embedding=chunk.embedding)
for chunk in config.chunker_instance.chunk(file_content)
]
except Exception as chunk_err:
logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}")
errors.append(f"Chunking failed for {full_path_key}: {chunk_err}")
continue # Skip this file if chunking fails
doc_metadata = {
"repository_full_name": repo_full_name,
"file_path": file_path,
"full_path": full_path_key, # For easier lookup
"url": file_url,
"sha": file_sha,
"type": file_type,
"indexed_at": datetime.now(timezone.utc).isoformat()
}
if existing_doc:
# Update existing document
logger.info(f"Updating document for file: {full_path_key}")
existing_doc.title = f"GitHub - {file_path}"
existing_doc.document_metadata = doc_metadata
existing_doc.content = summary_content # Update summary
existing_doc.embedding = summary_embedding # Update embedding
# Delete old chunks
await session.execute(
delete(Chunk)
.where(Chunk.document_id == existing_doc.id)
)
# Add new chunks
for chunk_obj in chunks_data:
chunk_obj.document_id = existing_doc.id
session.add(chunk_obj)
documents_processed += 1
else:
# Create new document
logger.info(f"Creating new document for file: {full_path_key}")
document = Document(
title=f"GitHub - {file_path}",
document_type=DocumentType.GITHUB_CONNECTOR,
document_metadata=doc_metadata,
content=summary_content, # Store summary
embedding=summary_embedding,
search_space_id=search_space_id,
chunks=chunks_data # Associate chunks directly
)
session.add(document)
documents_processed += 1
# All changes are committed once at the end of the loop rather than per repository
except Exception as repo_err:
logger.error(f"Failed to process repository {repo_full_name}: {repo_err}")
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
# Commit all changes at the end
await session.commit()
logger.info(f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files.")
except SQLAlchemyError as db_err:
await session.rollback()
logger.error(f"Database error during GitHub indexing for connector {connector_id}: {db_err}")
errors.append(f"Database error: {db_err}")
return documents_processed, "; ".join(errors) if errors else str(db_err)
except Exception as e:
await session.rollback()
logger.error(f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", exc_info=True)
errors.append(f"Unexpected error: {e}")
return documents_processed, "; ".join(errors) if errors else str(e)
error_message = "; ".join(errors) if errors else None
return documents_processed, error_message

View file

@@ -1,5 +1,12 @@
import uvicorn
import argparse
import logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Run the SurfSense application')

View file

@ -5,12 +5,14 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"alembic>=1.13.0",
"asyncpg>=0.30.0",
"chonkie[all]>=0.4.1",
"fastapi>=0.115.8",
"fastapi-users[oauth,sqlalchemy]>=14.0.1",
"firecrawl-py>=1.12.0",
"gpt-researcher>=0.12.12",
"github3.py==4.0.1",
"langchain-community>=0.3.17",
"langchain-unstructured>=0.1.6",
"litellm>=1.61.4",

View file

@@ -92,6 +92,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 },
]
[[package]]
name = "alembic"
version = "1.15.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mako" },
{ name = "sqlalchemy" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e6/57/e314c31b261d1e8a5a5f1908065b4ff98270a778ce7579bd4254477209a7/alembic-1.15.2.tar.gz", hash = "sha256:1c72391bbdeffccfe317eefba686cb9a3c078005478885413b95c3b26c57a8a7", size = 1925573 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/41/18/d89a443ed1ab9bcda16264716f809c663866d4ca8de218aa78fd50b38ead/alembic-1.15.2-py3-none-any.whl", hash = "sha256:2e76bd916d547f6900ec4bb5a90aeac1485d2c92536923d0b138c02b126edc53", size = 231911 },
]
[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -884,6 +898,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 },
]
[[package]]
name = "github3-py"
version = "4.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyjwt", extra = ["crypto"] },
{ name = "python-dateutil" },
{ name = "requests" },
{ name = "uritemplate" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/91/603bcaf8cd1b3927de64bf56c3a8915f6653ea7281919140c5bcff2bfe7b/github3.py-4.0.1.tar.gz", hash = "sha256:30d571076753efc389edc7f9aaef338a4fcb24b54d8968d5f39b1342f45ddd36", size = 36214038 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800 },
]
[[package]]
name = "google-api-core"
version = "2.24.2"
@@ -1614,6 +1643,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/89/a1/3e145759e776c8866488a71270c399bf7c4e554551ac2e247aa0a18a0596/makefun-1.15.6-py2.py3-none-any.whl", hash = "sha256:e69b870f0bb60304765b1e3db576aaecf2f9b3e5105afe8cfeff8f2afe6ad067", size = 22946 },
]
[[package]]
name = "mako"
version = "1.3.10"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markupsafe" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509 },
]
[[package]]
name = "markdown"
version = "3.7"
@ -3228,11 +3269,13 @@ name = "surf-new-backend"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "alembic" },
{ name = "asyncpg" },
{ name = "chonkie", extra = ["all"] },
{ name = "fastapi" },
{ name = "fastapi-users", extra = ["oauth", "sqlalchemy"] },
{ name = "firecrawl-py" },
{ name = "github3-py" },
{ name = "gpt-researcher" },
{ name = "langchain-community" },
{ name = "langchain-unstructured" },
@@ -3254,11 +3297,13 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "alembic", specifier = ">=1.13.0" },
{ name = "asyncpg", specifier = ">=0.30.0" },
{ name = "chonkie", extras = ["all"], specifier = ">=0.4.1" },
{ name = "fastapi", specifier = ">=0.115.8" },
{ name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" },
{ name = "firecrawl-py", specifier = ">=1.12.0" },
{ name = "github3-py", specifier = "==4.0.1" },
{ name = "gpt-researcher", specifier = ">=0.12.12" },
{ name = "langchain-community", specifier = ">=0.3.17" },
{ name = "langchain-unstructured", specifier = ">=0.1.6" },
@@ -3658,6 +3703,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/10/6d/adb955ecf60811a3735d508974bbb5358e7745b635dc001329267529c6f2/unstructured.pytesseract-0.3.15-py3-none-any.whl", hash = "sha256:a3f505c5efb7ff9f10379051a7dd6aa624b3be6b0f023ed6767cc80d0b1613d1", size = 14992 },
]
[[package]]
name = "uritemplate"
version = "4.1.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d2/5a/4742fdba39cd02a56226815abfa72fe0aa81c33bed16ed045647d6000eba/uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", size = 273898 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c0/7461b49cd25aeece13766f02ee576d1db528f1c37ce69aee300e075b485b/uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e", size = 10356 },
]
[[package]]
name = "urllib3"
version = "2.3.0"

View file

@@ -44,6 +44,7 @@ const getConnectorTypeDisplay = (type: string): string => {
"TAVILY_API": "Tavily API",
"SLACK_CONNECTOR": "Slack",
"NOTION_CONNECTOR": "Notion",
"GITHUB_CONNECTOR": "GitHub",
// Add other connector types here as needed
};
return typeMap[type] || type;
@@ -253,4 +254,4 @@ export default function ConnectorsPage() {
</Card>
</div>
);
}

View file

@@ -51,6 +51,7 @@ const getConnectorTypeDisplay = (type: string): string => {
"TAVILY_API": "Tavily API",
"SLACK_CONNECTOR": "Slack Connector",
"NOTION_CONNECTOR": "Notion Connector",
"GITHUB_CONNECTOR": "GitHub Connector",
// Add other connector types here as needed
};
return typeMap[type] || type;
@@ -85,7 +86,8 @@ export default function EditConnectorPage() {
"SERPER_API": "SERPER_API_KEY",
"TAVILY_API": "TAVILY_API_KEY",
"SLACK_CONNECTOR": "SLACK_BOT_TOKEN",
"NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN"
"NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN",
"GITHUB_CONNECTOR": "GITHUB_PAT"
};
return fieldMap[connectorType] || "";
};
@@ -136,6 +138,8 @@
name: values.name,
connector_type: connector.connector_type,
config: updatedConfig,
is_indexable: connector.is_indexable,
last_indexed_at: connector.last_indexed_at,
});
toast.success("Connector updated successfully!");
@@ -223,17 +227,21 @@
? "Slack Bot Token"
: connector?.connector_type === "NOTION_CONNECTOR"
? "Notion Integration Token"
: "API Key"}
: connector?.connector_type === "GITHUB_CONNECTOR"
? "GitHub Personal Access Token (PAT)"
: "API Key"}
</FormLabel>
<FormControl>
<Input
type="password"
placeholder={
connector?.connector_type === "SLACK_CONNECTOR"
? "Enter your Slack Bot Token"
? "Enter new Slack Bot Token (optional)"
: connector?.connector_type === "NOTION_CONNECTOR"
? "Enter your Notion Integration Token"
: "Enter your API key"
? "Enter new Notion Token (optional)"
: connector?.connector_type === "GITHUB_CONNECTOR"
? "Enter new GitHub PAT (optional)"
: "Enter new API key (optional)"
}
{...field}
/>
@@ -243,7 +251,9 @@
? "Enter a new Slack Bot Token or leave blank to keep your existing token."
: connector?.connector_type === "NOTION_CONNECTOR"
? "Enter a new Notion Integration Token or leave blank to keep your existing token."
: "Enter a new API key or leave blank to keep your existing key."}
: connector?.connector_type === "GITHUB_CONNECTOR"
? "Enter a new GitHub PAT or leave blank to keep your existing token."
: "Enter a new API key or leave blank to keep your existing key."}
</FormDescription>
<FormMessage />
</FormItem>
@@ -276,4 +286,4 @@
</motion.div>
</div>
);
}

View file

@@ -0,0 +1,298 @@
"use client";
import { useState } from "react";
import { useRouter, useParams } from "next/navigation";
import { motion } from "framer-motion";
import { zodResolver } from "@hookform/resolvers/zod";
import { useForm } from "react-hook-form";
import * as z from "zod";
import { toast } from "sonner";
import { ArrowLeft, Check, Info, Loader2, Github } from "lucide-react";
// Assuming useSearchSourceConnectors hook exists and works similarly
import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Input } from "@/components/ui/input";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import {
Alert,
AlertDescription,
AlertTitle,
} from "@/components/ui/alert";
import {
Accordion,
AccordionContent,
AccordionItem,
AccordionTrigger,
} from "@/components/ui/accordion";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
// Define the form schema with Zod for GitHub
const githubConnectorFormSchema = z.object({
name: z.string().min(3, {
message: "Connector name must be at least 3 characters.",
}),
github_pat: z.string()
.min(20, { // Apply min length first
message: "GitHub Personal Access Token seems too short.",
})
.refine(pat => pat.startsWith('ghp_') || pat.startsWith('github_pat_'), { // Then refine the pattern
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
}),
});
// Define the type for the form values
type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
export default function GithubConnectorPage() {
const router = useRouter();
const params = useParams();
const searchSpaceId = params.search_space_id as string;
const [isSubmitting, setIsSubmitting] = useState(false);
const { createConnector } = useSearchSourceConnectors(); // Assuming this hook exists
// Initialize the form
const form = useForm<GithubConnectorFormValues>({
resolver: zodResolver(githubConnectorFormSchema),
defaultValues: {
name: "GitHub Connector",
github_pat: "",
},
});
// Handle form submission
const onSubmit = async (values: GithubConnectorFormValues) => {
setIsSubmitting(true);
try {
await createConnector({
name: values.name,
connector_type: "GITHUB_CONNECTOR",
config: {
GITHUB_PAT: values.github_pat,
},
is_indexable: true, // GitHub connector is indexable
last_indexed_at: null, // New connector hasn't been indexed
});
toast.success("GitHub connector created successfully!");
// Navigate back to connectors management page (or the add page)
router.push(`/dashboard/${searchSpaceId}/connectors`);
} catch (error) {
console.error("Error creating GitHub connector:", error);
// Display specific backend error message if available
const errorMessage = error instanceof Error ? error.message : "Failed to create GitHub connector. Please check the PAT and permissions.";
toast.error(errorMessage);
} finally {
setIsSubmitting(false);
}
};
return (
<div className="container mx-auto py-8 max-w-3xl">
<Button
variant="ghost"
className="mb-6"
onClick={() => router.push(`/dashboard/${searchSpaceId}/connectors/add`)}
>
<ArrowLeft className="mr-2 h-4 w-4" />
Back to Add Connectors
</Button>
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.5 }}
>
<Tabs defaultValue="connect" className="w-full">
<TabsList className="grid w-full grid-cols-2 mb-6">
<TabsTrigger value="connect">Connect GitHub</TabsTrigger>
<TabsTrigger value="documentation">Setup Guide</TabsTrigger>
</TabsList>
<TabsContent value="connect">
<Card className="border-2 border-border">
<CardHeader>
<CardTitle className="text-2xl font-bold flex items-center gap-2"><Github className="h-6 w-6" /> Connect GitHub Account</CardTitle>
<CardDescription>
Integrate with GitHub using a Personal Access Token (PAT) to search and retrieve information from accessible repositories. This connector can index your code and documentation.
</CardDescription>
</CardHeader>
<CardContent>
<Alert className="mb-6 bg-muted">
<Info className="h-4 w-4" />
<AlertTitle>GitHub Personal Access Token (PAT) Required</AlertTitle>
<AlertDescription>
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to use this connector. You can create one from your
<a
href="https://github.com/settings/personal-access-tokens"
target="_blank"
rel="noopener noreferrer"
className="font-medium underline underline-offset-4 ml-1"
>
GitHub Developer Settings
</a>.
</AlertDescription>
</Alert>
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)} className="space-y-6">
<FormField
control={form.control}
name="name"
render={({ field }) => (
<FormItem>
<FormLabel>Connector Name</FormLabel>
<FormControl>
<Input placeholder="My GitHub Connector" {...field} />
</FormControl>
<FormDescription>
A friendly name to identify this GitHub connection.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="github_pat"
render={({ field }) => (
<FormItem>
<FormLabel>GitHub Personal Access Token (PAT)</FormLabel>
<FormControl>
<Input
type="password"
placeholder="ghp_... or github_pat_..."
{...field}
/>
</FormControl>
<FormDescription>
Your GitHub PAT will be encrypted and stored securely. Ensure it has the necessary 'repo' scopes.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<div className="flex justify-end">
<Button
type="submit"
disabled={isSubmitting}
className="w-full sm:w-auto"
>
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
Connecting...
</>
) : (
<>
<Check className="mr-2 h-4 w-4" />
Connect GitHub
</>
)}
</Button>
</div>
</form>
</Form>
</CardContent>
<CardFooter className="flex flex-col items-start border-t bg-muted/50 px-6 py-4">
<h4 className="text-sm font-medium">What you get with GitHub integration:</h4>
<ul className="mt-2 list-disc pl-5 text-sm text-muted-foreground">
<li>Search through code and documentation in your repositories</li>
<li>Access READMEs, Markdown files, and common code files</li>
<li>Connect your project knowledge directly to your search space</li>
<li>Index your repositories for enhanced search capabilities</li>
</ul>
</CardFooter>
</Card>
</TabsContent>
<TabsContent value="documentation">
<Card className="border-2 border-border">
<CardHeader>
<CardTitle className="text-2xl font-bold">GitHub Connector Setup Guide</CardTitle>
<CardDescription>
Learn how to generate a Personal Access Token (PAT) and connect your GitHub account.
</CardDescription>
</CardHeader>
<CardContent className="space-y-6">
<div>
<h3 className="text-xl font-semibold mb-2">How it works</h3>
<p className="text-muted-foreground">
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the GitHub API. It fetches information about repositories accessible to the token and indexes relevant files (code, markdown, text).
</p>
<ul className="mt-2 list-disc pl-5 text-muted-foreground">
<li>The connector indexes files based on common code and documentation extensions.</li>
<li>Large files (over 1MB) are skipped during indexing.</li>
<li>Indexing runs periodically (check connector settings for frequency) to keep content up-to-date.</li>
</ul>
</div>
<Accordion type="single" collapsible className="w-full">
<AccordionItem value="create_pat">
<AccordionTrigger className="text-lg font-medium">Step 1: Create a GitHub PAT</AccordionTrigger>
<AccordionContent className="space-y-4">
<Alert className="bg-muted">
<Info className="h-4 w-4" />
<AlertTitle>Token Security</AlertTitle>
<AlertDescription>
Treat your PAT like a password. Store it securely and consider using fine-grained tokens if possible.
</AlertDescription>
</Alert>
<div className="space-y-6">
<div>
<h4 className="font-medium mb-2">Generating a Token:</h4>
<ol className="list-decimal pl-5 space-y-3">
<li>Go to your GitHub <a href="https://github.com/settings/tokens" target="_blank" rel="noopener noreferrer" className="font-medium underline underline-offset-4">Developer settings</a>.</li>
<li>Click on <strong>Personal access tokens</strong>, then choose <strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong> (recommended if available and suitable).</li>
<li>Click <strong>Generate new token</strong> (and choose the appropriate type).</li>
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
<li>Set an expiration date for the token (recommended for security).</li>
<li>Under <strong>Select scopes</strong> (for classic tokens) or <strong>Repository access</strong> (for fine-grained), grant the necessary permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent read access to repositories for fine-grained tokens) is required to read repository content.</li>
<li>Click <strong>Generate token</strong>.</li>
<li><strong>Important:</strong> Copy your new PAT immediately. You won't be able to see it again after leaving the page.</li>
</ol>
</div>
</div>
</AccordionContent>
</AccordionItem>
<AccordionItem value="connect_app">
<AccordionTrigger className="text-lg font-medium">Step 2: Connect in SurfSense</AccordionTrigger>
<AccordionContent className="space-y-4">
<ol className="list-decimal pl-5 space-y-3">
<li>Paste the copied GitHub PAT into the "GitHub Personal Access Token (PAT)" field on the "Connect GitHub" tab.</li>
<li>Optionally, give the connector a custom name.</li>
<li>Click the <strong>Connect GitHub</strong> button.</li>
<li>If the connection is successful, you will be redirected and can start indexing from the Connectors page.</li>
</ol>
</AccordionContent>
</AccordionItem>
</Accordion>
</CardContent>
</Card>
</TabsContent>
</Tabs>
</motion.div>
</div>
);
}

View file

@@ -14,6 +14,7 @@ import {
IconMail,
IconBrandZoom,
IconChevronRight,
IconWorldWww,
} from "@tabler/icons-react";
import { motion, AnimatePresence } from "framer-motion";
import { useState } from "react";
@ -22,36 +23,43 @@ import Link from "next/link";
import { Button } from "@/components/ui/button";
import { Separator } from "@/components/ui/separator";
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
import { useForm } from "react-hook-form";
// Define the Connector type
interface Connector {
id: string;
title: string;
description: string;
icon: React.ReactNode;
status: "available" | "coming-soon" | "connected"; // Added connected status example
}
interface ConnectorCategory {
id: string;
title: string;
connectors: Connector[];
}
// Define connector categories and their connectors
const connectorCategories = [
const connectorCategories: ConnectorCategory[] = [
{
id: "search-engines",
title: "Search Engines",
description: "Connect to search engines to enhance your research capabilities.",
icon: <IconSearch className="h-5 w-5" />,
connectors: [
{
id: "tavily-api",
title: "Tavily Search API",
description: "Connect to Tavily Search API to search the web.",
icon: <IconSearch className="h-6 w-6" />,
status: "available",
},
{
id: "serper-api",
title: "Serper API",
description: "Connect to Serper API to search the web.",
icon: <IconBrandGoogle className="h-6 w-6" />,
status: "coming-soon",
id: "web-search",
title: "Web Search",
description: "Enable web search capabilities for broader context.",
icon: <IconWorldWww className="h-6 w-6" />,
status: "available", // Example status
// Potentially add config form here if needed (e.g., choosing provider)
},
// Add other search engine connectors like Tavily, Serper if they have UI config
],
},
{
id: "team-chats",
title: "Team Chats",
description: "Connect to your team communication platforms.",
icon: <IconMessages className="h-5 w-5" />,
connectors: [
{
id: "slack-connector",
@@ -79,8 +87,6 @@ const connectorCategories = [
{
id: "knowledge-bases",
title: "Knowledge Bases",
description: "Connect to your knowledge bases and documentation.",
icon: <IconDatabase className="h-5 w-5" />,
connectors: [
{
id: "notion-connector",
@@ -88,21 +94,20 @@ const connectorCategories = [
description: "Connect to your Notion workspace to access pages and databases.",
icon: <IconBrandNotion className="h-6 w-6" />,
status: "available",
// No form here, assumes it links to its own page
},
{
id: "github",
id: "github-connector", // Keep the id simple
title: "GitHub",
description: "Connect to GitHub repositories to access code and documentation.",
description: "Connect a GitHub PAT to index code and docs from accessible repositories.",
icon: <IconBrandGithub className="h-6 w-6" />,
status: "coming-soon",
status: "available",
},
],
},
{
id: "communication",
title: "Communication",
description: "Connect to your email and meeting platforms.",
icon: <IconMail className="h-5 w-5" />,
connectors: [
{
id: "gmail",
@@ -125,7 +130,7 @@ const connectorCategories = [
export default function ConnectorsPage() {
const params = useParams();
const searchSpaceId = params.search_space_id as string;
const [expandedCategories, setExpandedCategories] = useState<string[]>(["search-engines"]);
const [expandedCategories, setExpandedCategories] = useState<string[]>(["search-engines", "knowledge-bases"]);
const toggleCategory = (categoryId: string) => {
setExpandedCategories(prev =>
@@ -150,104 +155,68 @@
</motion.div>
<div className="space-y-6">
{connectorCategories.map((category, categoryIndex) => (
{connectorCategories.map((category) => (
<Collapsible
key={category.id}
open={expandedCategories.includes(category.id)}
onOpenChange={() => toggleCategory(category.id)}
className="border rounded-lg overflow-hidden bg-card"
className="space-y-2"
>
<CollapsibleTrigger asChild>
<motion.div
initial={{ opacity: 0, y: 10 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.3, delay: categoryIndex * 0.1 }}
className="p-4 flex items-center justify-between cursor-pointer hover:bg-accent/50 transition-colors"
>
<div className="flex items-center gap-3">
<div className="p-2 rounded-md bg-primary/10 text-primary">
{category.icon}
</div>
<div>
<h2 className="text-xl font-semibold">{category.title}</h2>
<p className="text-sm text-muted-foreground">{category.description}</p>
</div>
</div>
<IconChevronRight
className={cn(
"h-5 w-5 text-muted-foreground transition-transform duration-200",
expandedCategories.includes(category.id) && "rotate-90"
)}
/>
</motion.div>
</CollapsibleTrigger>
<div className="flex items-center justify-between space-x-4 px-1">
<h3 className="text-lg font-semibold dark:text-gray-200">{category.title}</h3>
<CollapsibleTrigger asChild>
{/* Replace with your preferred expand/collapse icon/button */}
<button className="text-sm text-indigo-600 hover:underline dark:text-indigo-400">
{expandedCategories.includes(category.id) ? "Collapse" : "Expand"}
</button>
</CollapsibleTrigger>
</div>
<CollapsibleContent>
<Separator />
<div className="p-4 grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
<AnimatePresence>
{category.connectors.map((connector, index) => (
<motion.div
key={connector.id}
initial={{ opacity: 0, scale: 0.95 }}
animate={{ opacity: 1, scale: 1 }}
exit={{ opacity: 0, scale: 0.95 }}
transition={{
duration: 0.2,
delay: index * 0.05,
type: "spring",
stiffness: 300,
damping: 30
}}
className={cn(
"relative group flex flex-col p-4 rounded-lg border",
connector.status === "coming-soon" ? "opacity-70" : ""
)}
>
<div className="absolute inset-0 opacity-0 group-hover:opacity-100 transition duration-200 bg-gradient-to-t from-accent/50 to-transparent rounded-lg pointer-events-none" />
<div className="mb-4 relative z-10 text-primary">
{connector.icon}
<div className="grid grid-cols-1 gap-6 sm:grid-cols-2 lg:grid-cols-3 p-1">
{category.connectors.map((connector) => (
<div key={connector.id} className="col-span-1 flex flex-col divide-y divide-gray-200 dark:divide-gray-700 rounded-lg bg-white dark:bg-gray-800 shadow">
<div className="flex w-full items-center justify-between space-x-6 p-6 flex-grow">
<div className="flex-1 truncate">
<div className="flex items-center space-x-3">
<span className="text-gray-900 dark:text-gray-100">{connector.icon}</span>
<h3 className="truncate text-sm font-medium text-gray-900 dark:text-gray-100">
{connector.title}
</h3>
{connector.status === "coming-soon" && (
<span className="inline-block flex-shrink-0 rounded-full bg-yellow-100 px-2 py-0.5 text-xs font-medium text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200">
Coming soon
</span>
)}
{/* TODO: Add 'Connected' badge based on actual state */}
</div>
<p className="mt-1 truncate text-sm text-gray-500 dark:text-gray-400">
{connector.description}
</p>
</div>
<div className="flex items-center justify-between mb-2">
<h3 className="text-lg font-semibold group-hover:translate-x-1 transition duration-200">
{connector.title}
</h3>
{connector.status === "coming-soon" && (
<span className="text-xs bg-muted px-2 py-1 rounded-full">Coming soon</span>
)}
</div>
<p className="text-sm text-muted-foreground mb-4 flex-grow">
{connector.description}
</p>
{connector.status === "available" ? (
<Link
href={`/dashboard/${searchSpaceId}/connectors/add/${connector.id}`}
className="w-full mt-auto"
>
<Button
variant="default"
className="w-full"
>
</div>
{/* Always render Link button if available */}
{connector.status === 'available' && (
<div className="px-6 py-4 border-t border-gray-200 dark:border-gray-700">
<Link href={`/dashboard/${searchSpaceId}/connectors/add/${connector.id}`}>
<Button variant="default" className="w-full">
Connect
</Button>
</Link>
) : (
<Button
variant="outline"
className="w-full mt-auto"
disabled
>
Notify Me
</div>
)}
{connector.status === 'coming-soon' && (
<div className="px-6 py-4 border-t border-gray-200 dark:border-gray-700">
<Button variant="outline" disabled className="w-full">
Coming Soon
</Button>
)}
</motion.div>
))}
</AnimatePresence>
</div>
)}
{/* TODO: Add logic for 'connected' status */}
</div>
))}
</div>
</CollapsibleContent>
<Separator className="my-4" />
</Collapsible>
))}
</div>