mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-05 03:59:06 +00:00
documents table migration, fix/update github indexing
This commit is contained in:
parent
bb198e38c0
commit
a26fac435b
5 changed files with 197 additions and 17 deletions
|
@ -0,0 +1,70 @@
|
||||||
|
"""Add GITHUB_CONNECTOR to DocumentType enum
|
||||||
|
|
||||||
|
Revision ID: e55302644c51
|
||||||
|
Revises: 1
|
||||||
|
Create Date: 2025-04-13 19:56:00.059921
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
revision: str = 'e55302644c51'
down_revision: Union[str, None] = '1'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


# Name of the PostgreSQL ENUM type backing DocumentType, and the member added
# by this revision.
ENUM_NAME = 'documenttype'  # Make sure this matches the name in your DB (usually lowercase class name)
NEW_VALUE = 'GITHUB_CONNECTOR'
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Upgrade schema: append GITHUB_CONNECTOR to the documenttype enum.

    IF NOT EXISTS (PostgreSQL 9.6+) makes the migration idempotent, so
    re-running it after a partially applied deploy is a no-op instead of
    failing with a duplicate-value error.
    """
    # NOTE(review): on PostgreSQL < 12, ALTER TYPE ... ADD VALUE cannot run
    # inside a transaction block; Alembic must use autocommit there — confirm
    # the target server version if this errors.
    op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE IF NOT EXISTS '{NEW_VALUE}'")
|
||||||
|
|
||||||
|
|
||||||
|
# Warning: This will delete all rows with the new value
def downgrade() -> None:
    """Downgrade schema - remove GITHUB_CONNECTOR from enum."""
    # Temporary name the current (wider) type is parked under.
    previous_type = f"{ENUM_NAME}_old"

    # Enum members exactly as they existed before GITHUB_CONNECTOR was added.
    legacy_members = (
        'EXTENSION',
        'CRAWLED_URL',
        'FILE',
        'SLACK_CONNECTOR',
        'NOTION_CONNECTOR',
        'YOUTUBE_VIDEO'
    )
    members_sql = ", ".join([f"'{m}'" for m in legacy_members])

    # Table and column holding the enum (adjust if different).
    tbl = 'documents'
    col = 'document_type'

    # Step 1: park the current enum type under the temporary name.
    op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {previous_type}")

    # Step 2: recreate the original (pre-GITHUB_CONNECTOR) type under the
    # real name.
    op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({members_sql})")

    # Step 3: rows using the removed value cannot be cast back — drop them
    # before re-pointing the column.
    op.execute(
        f"DELETE FROM {tbl} WHERE {col}::text = '{NEW_VALUE}'"
    )

    # Step 4: switch the column to the recreated type via a text round-trip
    # cast of the surviving values.
    op.execute(
        f"ALTER TABLE {tbl} ALTER COLUMN {col} "
        f"TYPE {ENUM_NAME} USING {col}::text::{ENUM_NAME}"
    )

    # Step 5: the parked type is now unreferenced and can be dropped.
    op.execute(f"DROP TYPE {previous_type}")
# ### end Alembic commands ###
|
|
@ -2,7 +2,6 @@ import base64
|
||||||
import logging
|
import logging
|
||||||
from typing import List, Optional, Dict, Any, Tuple
|
from typing import List, Optional, Dict, Any, Tuple
|
||||||
from github3 import login as github_login, exceptions as github_exceptions
|
from github3 import login as github_login, exceptions as github_exceptions
|
||||||
from github3.repos.repo import Repository
|
|
||||||
from github3.repos.contents import Contents
|
from github3.repos.contents import Contents
|
||||||
from github3.exceptions import ForbiddenError, NotFoundError
|
from github3.exceptions import ForbiddenError, NotFoundError
|
||||||
|
|
||||||
|
@ -26,6 +25,33 @@ MAX_FILE_SIZE = 1 * 1024 * 1024
|
||||||
class GitHubConnector:
|
class GitHubConnector:
|
||||||
"""Connector for interacting with the GitHub API."""
|
"""Connector for interacting with the GitHub API."""
|
||||||
|
|
||||||
|
# Directories to skip during file traversal
|
||||||
|
SKIPPED_DIRS = {
|
||||||
|
# Version control
|
||||||
|
'.git',
|
||||||
|
# Dependencies
|
||||||
|
'node_modules',
|
||||||
|
'vendor',
|
||||||
|
# Build artifacts / Caches
|
||||||
|
'build',
|
||||||
|
'dist',
|
||||||
|
'target',
|
||||||
|
'__pycache__',
|
||||||
|
# Virtual environments
|
||||||
|
'venv',
|
||||||
|
'.venv',
|
||||||
|
'env',
|
||||||
|
# IDE/Editor config
|
||||||
|
'.vscode',
|
||||||
|
'.idea',
|
||||||
|
'.project',
|
||||||
|
'.settings',
|
||||||
|
# Temporary / Logs
|
||||||
|
'tmp',
|
||||||
|
'logs',
|
||||||
|
# Add other project-specific irrelevant directories if needed
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, token: str):
|
def __init__(self, token: str):
|
||||||
"""
|
"""
|
||||||
Initializes the GitHub connector.
|
Initializes the GitHub connector.
|
||||||
|
@ -54,8 +80,7 @@ class GitHubConnector:
|
||||||
# type='owner' fetches repos owned by the user
|
# type='owner' fetches repos owned by the user
|
||||||
# type='member' fetches repos the user is a collaborator on (including orgs)
|
# type='member' fetches repos the user is a collaborator on (including orgs)
|
||||||
# type='all' fetches both
|
# type='all' fetches both
|
||||||
for repo in self.gh.repositories(type='all', sort='updated'):
|
for repo in self.gh.repositories(type='owner', sort='updated'):
|
||||||
if isinstance(repo, Repository):
|
|
||||||
repos_data.append({
|
repos_data.append({
|
||||||
"id": repo.id,
|
"id": repo.id,
|
||||||
"name": repo.name,
|
"name": repo.name,
|
||||||
|
@ -63,7 +88,7 @@ class GitHubConnector:
|
||||||
"private": repo.private,
|
"private": repo.private,
|
||||||
"url": repo.html_url,
|
"url": repo.html_url,
|
||||||
"description": repo.description or "",
|
"description": repo.description or "",
|
||||||
"last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
|
"last_updated": repo.updated_at if repo.updated_at else None,
|
||||||
})
|
})
|
||||||
logger.info(f"Fetched {len(repos_data)} repositories.")
|
logger.info(f"Fetched {len(repos_data)} repositories.")
|
||||||
return repos_data
|
return repos_data
|
||||||
|
@ -90,8 +115,7 @@ class GitHubConnector:
|
||||||
if not repo:
|
if not repo:
|
||||||
logger.warning(f"Repository '{repo_full_name}' not found.")
|
logger.warning(f"Repository '{repo_full_name}' not found.")
|
||||||
return []
|
return []
|
||||||
|
contents = repo.directory_contents(directory_path=path) # Use directory_contents for clarity
|
||||||
contents = repo.directory_contents(path=path) # Use directory_contents for clarity
|
|
||||||
|
|
||||||
# contents returns a list of tuples (name, content_obj)
|
# contents returns a list of tuples (name, content_obj)
|
||||||
for item_name, content_item in contents:
|
for item_name, content_item in contents:
|
||||||
|
@ -99,6 +123,11 @@ class GitHubConnector:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if content_item.type == 'dir':
|
if content_item.type == 'dir':
|
||||||
|
# Check if the directory name is in the skipped list
|
||||||
|
if content_item.name in self.SKIPPED_DIRS:
|
||||||
|
logger.debug(f"Skipping directory: {content_item.path}")
|
||||||
|
continue # Skip recursion for this directory
|
||||||
|
|
||||||
# Recursively fetch contents of subdirectory
|
# Recursively fetch contents of subdirectory
|
||||||
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
|
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
|
||||||
elif content_item.type == 'file':
|
elif content_item.type == 'file':
|
||||||
|
|
|
@ -244,6 +244,33 @@ async def stream_connector_search_results(
|
||||||
all_raw_documents.extend(notion_chunks)
|
all_raw_documents.extend(notion_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
# Github Connector
|
||||||
|
if connector == "GITHUB_CONNECTOR":
|
||||||
|
# Send terminal message about starting search
|
||||||
|
yield streaming_service.add_terminal_message("Starting to search for GitHub connector...")
|
||||||
|
print("Starting to search for GitHub connector...")
|
||||||
|
# Search using Github API with reformulated query
|
||||||
|
result_object, github_chunks = await connector_service.search_github(
|
||||||
|
user_query=reformulated_query,
|
||||||
|
user_id=user_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
top_k=TOP_K
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send terminal message about search results
|
||||||
|
yield streaming_service.add_terminal_message(
|
||||||
|
f"Found {len(result_object['sources'])} relevant results from Github",
|
||||||
|
"success"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update sources
|
||||||
|
all_sources.append(result_object)
|
||||||
|
yield streaming_service.update_sources(all_sources)
|
||||||
|
|
||||||
|
# Add documents to collection
|
||||||
|
all_raw_documents.extend(github_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# If we have documents to research
|
# If we have documents to research
|
||||||
|
|
|
@ -559,3 +559,56 @@ class ConnectorService:
|
||||||
}
|
}
|
||||||
|
|
||||||
return result_object, youtube_chunks
|
return result_object, youtube_chunks
|
||||||
|
|
||||||
|
async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for GitHub documents and return both the source information and langchain documents.

    Args:
        user_query: Query text passed to the hybrid retriever.
        user_id: Owner whose documents are searched.
        search_space_id: Search space the query is restricted to.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        tuple: (sources_info, langchain_documents)
    """
    github_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="GITHUB_CONNECTOR"
    )

    # Map github_chunks to the required format, de-duplicating sources.
    mapped_sources = {}
    for chunk in github_chunks:
        # Fix for UI - assign a unique ID for citation/source tracking.
        # setdefault guards against a chunk arriving without a 'document'
        # dict (the original indexed assignment would raise KeyError there,
        # while the read path below already treats 'document' as optional).
        document = chunk.setdefault('document', {})
        document['id'] = self.source_id_counter

        # Extract document metadata
        metadata = document.get('metadata', {})

        # Create a mapped source entry
        source = {
            "id": self.source_id_counter,
            "title": document.get('title', 'GitHub Document'),  # Use specific title if available
            "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
            "url": metadata.get('url', '')  # Use URL if available in metadata
        }

        self.source_id_counter += 1

        # Use a unique identifier for tracking unique sources (URL preferred)
        source_key = source.get("url") or source.get("title")
        if source_key and source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Convert to list of sources
    sources_list = list(mapped_sources.values())

    # Create result object
    result_object = {
        "id": 7,  # Assuming 7 is the next available connector ID
        "name": "GitHub",
        "type": "GITHUB_CONNECTOR",
        "sources": sources_list,
    }

    return result_object, github_chunks
|
||||||
|
|
|
@ -94,7 +94,7 @@ import rehypeSanitize from "rehype-sanitize";
|
||||||
import remarkGfm from "remark-gfm";
|
import remarkGfm from "remark-gfm";
|
||||||
import { DocumentViewer } from "@/components/document-viewer";
|
import { DocumentViewer } from "@/components/document-viewer";
|
||||||
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
||||||
import { IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
|
import { IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
|
||||||
|
|
||||||
// Define animation variants for reuse
|
// Define animation variants for reuse
|
||||||
const fadeInScale = {
|
const fadeInScale = {
|
||||||
|
@ -142,6 +142,7 @@ const documentTypeIcons = {
|
||||||
NOTION_CONNECTOR: IconBrandNotion,
|
NOTION_CONNECTOR: IconBrandNotion,
|
||||||
FILE: File,
|
FILE: File,
|
||||||
YOUTUBE_VIDEO: IconBrandYoutube,
|
YOUTUBE_VIDEO: IconBrandYoutube,
|
||||||
|
GITHUB_CONNECTOR: IconBrandGithub,
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
const columns: ColumnDef<Document>[] = [
|
const columns: ColumnDef<Document>[] = [
|
||||||
|
|
Loading…
Add table
Reference in a new issue