Add documents table migration; fix and update GitHub indexing

This commit is contained in:
Adamsmith6300 2025-04-13 21:23:05 -07:00
parent bb198e38c0
commit a26fac435b
5 changed files with 197 additions and 17 deletions

View file

@ -0,0 +1,70 @@
"""Add GITHUB_CONNECTOR to DocumentType enum
Revision ID: e55302644c51
Revises: 1
Create Date: 2025-04-13 19:56:00.059921
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = 'e55302644c51'
down_revision: Union[str, None] = '1'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
# Name of the PostgreSQL ENUM type and the value this migration adds.
ENUM_NAME: str = 'documenttype' # Must match the type name in the DB (usually the lowercase enum class name)
NEW_VALUE: str = 'GITHUB_CONNECTOR' # Enum member introduced by this migration
def upgrade() -> None:
    """Upgrade schema: append GITHUB_CONNECTOR to the documenttype enum."""
    add_value_sql = f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'"
    op.execute(add_value_sql)
def downgrade() -> None:
    """Downgrade schema - remove GITHUB_CONNECTOR from enum.

    Warning: this is destructive — every documents row whose
    document_type is GITHUB_CONNECTOR is deleted before the enum
    is rebuilt without that value.
    """
    # Temporary name for the enum type that still contains GITHUB_CONNECTOR.
    legacy_type = f"{ENUM_NAME}_old"

    # Enum values as they existed *before* GITHUB_CONNECTOR was added.
    previous_values = [
        'EXTENSION',
        'CRAWLED_URL',
        'FILE',
        'SLACK_CONNECTOR',
        'NOTION_CONNECTOR',
        'YOUTUBE_VIDEO',
    ]
    values_clause = ", ".join(f"'{value}'" for value in previous_values)

    # Table/column that uses the enum (adjust if different).
    table = 'documents'
    column = 'document_type'

    # Statements must run in this exact order:
    # rename old type -> create replacement -> purge new-value rows ->
    # re-point the column -> drop the renamed type.
    statements = [
        f"ALTER TYPE {ENUM_NAME} RENAME TO {legacy_type}",
        f"CREATE TYPE {ENUM_NAME} AS ENUM({values_clause})",
        f"DELETE FROM {table} WHERE {column}::text = '{NEW_VALUE}'",
        f"ALTER TABLE {table} ALTER COLUMN {column} "
        f"TYPE {ENUM_NAME} USING {column}::text::{ENUM_NAME}",
        f"DROP TYPE {legacy_type}",
    ]
    for statement in statements:
        op.execute(statement)
# ### end Alembic commands ###

View file

@ -2,7 +2,6 @@ import base64
import logging
from typing import List, Optional, Dict, Any, Tuple
from github3 import login as github_login, exceptions as github_exceptions
from github3.repos.repo import Repository
from github3.repos.contents import Contents
from github3.exceptions import ForbiddenError, NotFoundError
@ -26,6 +25,33 @@ MAX_FILE_SIZE = 1 * 1024 * 1024
class GitHubConnector:
"""Connector for interacting with the GitHub API."""
# Directory names skipped during repository file traversal; matched against
# each directory's base name (not its full path) when recursing.
SKIPPED_DIRS = {
# Version control metadata
'.git',
# Third-party dependencies
'node_modules',
'vendor',
# Build artifacts / caches
'build',
'dist',
'target',
'__pycache__',
# Virtual environments
'venv',
'.venv',
'env',
# IDE/Editor config
'.vscode',
'.idea',
'.project',
'.settings',
# Temporary files / logs
'tmp',
'logs',
# Add other project-specific irrelevant directories if needed
}
def __init__(self, token: str):
"""
Initializes the GitHub connector.
@ -54,17 +80,16 @@ class GitHubConnector:
# type='owner' fetches repos owned by the user
# type='member' fetches repos the user is a collaborator on (including orgs)
# type='all' fetches both
for repo in self.gh.repositories(type='all', sort='updated'):
if isinstance(repo, Repository):
repos_data.append({
"id": repo.id,
"name": repo.name,
"full_name": repo.full_name,
"private": repo.private,
"url": repo.html_url,
"description": repo.description or "",
"last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
})
for repo in self.gh.repositories(type='owner', sort='updated'):
repos_data.append({
"id": repo.id,
"name": repo.name,
"full_name": repo.full_name,
"private": repo.private,
"url": repo.html_url,
"description": repo.description or "",
"last_updated": repo.updated_at if repo.updated_at else None,
})
logger.info(f"Fetched {len(repos_data)} repositories.")
return repos_data
except Exception as e:
@ -90,8 +115,7 @@ class GitHubConnector:
if not repo:
logger.warning(f"Repository '{repo_full_name}' not found.")
return []
contents = repo.directory_contents(path=path) # Use directory_contents for clarity
contents = repo.directory_contents(directory_path=path) # Use directory_contents for clarity
# contents returns a list of tuples (name, content_obj)
for item_name, content_item in contents:
@ -99,6 +123,11 @@ class GitHubConnector:
continue
if content_item.type == 'dir':
# Check if the directory name is in the skipped list
if content_item.name in self.SKIPPED_DIRS:
logger.debug(f"Skipping directory: {content_item.path}")
continue # Skip recursion for this directory
# Recursively fetch contents of subdirectory
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
elif content_item.type == 'file':

View file

@ -244,6 +244,33 @@ async def stream_connector_search_results(
all_raw_documents.extend(notion_chunks)
# Github Connector
if connector == "GITHUB_CONNECTOR":
# Send terminal message about starting search
yield streaming_service.add_terminal_message("Starting to search for GitHub connector...")
print("Starting to search for GitHub connector...")
# Search using Github API with reformulated query
result_object, github_chunks = await connector_service.search_github(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=TOP_K
)
# Send terminal message about search results
yield streaming_service.add_terminal_message(
f"Found {len(result_object['sources'])} relevant results from Github",
"success"
)
# Update sources
all_sources.append(result_object)
yield streaming_service.update_sources(all_sources)
# Add documents to collection
all_raw_documents.extend(github_chunks)
# If we have documents to research

View file

@ -558,4 +558,57 @@ class ConnectorService:
"sources": sources_list,
}
return result_object, youtube_chunks
return result_object, youtube_chunks
async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for GitHub documents and return both the source information and langchain documents.

    Args:
        user_query: Query text (possibly reformulated) to search with.
        user_id: ID of the user whose documents are searched.
        search_space_id: Search space to restrict the search to.
        top_k: Maximum number of chunks to retrieve (default 20).

    Returns:
        tuple: (sources_info, langchain_documents)
    """
    github_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="GITHUB_CONNECTOR"
    )

    # Map github_chunks to the required source format, de-duplicating entries.
    mapped_sources = {}
    for chunk in github_chunks:
        # setdefault tolerates chunks missing a 'document' key (the original
        # hard-indexed here while using .get defensively just below) and still
        # mutates the chunk in place so the UI sees the assigned ID.
        document = chunk.setdefault('document', {})
        # Fix for UI - assign a unique ID for citation/source tracking
        document['id'] = self.source_id_counter
        metadata = document.get('metadata', {})

        # Create a mapped source entry
        source = {
            "id": self.source_id_counter,
            "title": document.get('title', 'GitHub Document'),  # Use specific title if available
            "description": metadata.get('description', chunk.get('content', '')[:100]),  # Description or content preview
            "url": metadata.get('url', '')  # Use URL if available in metadata
        }
        self.source_id_counter += 1

        # Track unique sources; prefer URL as the de-dup key, fall back to title.
        source_key = source["url"] or source["title"]
        if source_key and source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Convert to list of sources
    sources_list = list(mapped_sources.values())

    # Create result object
    result_object = {
        "id": 7,  # Connector display ID; assumed next available — TODO confirm
        "name": "GitHub",
        "type": "GITHUB_CONNECTOR",
        "sources": sources_list,
    }

    return result_object, github_chunks

View file

@ -94,7 +94,7 @@ import rehypeSanitize from "rehype-sanitize";
import remarkGfm from "remark-gfm";
import { DocumentViewer } from "@/components/document-viewer";
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
import { IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
import { IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
// Define animation variants for reuse
const fadeInScale = {
@ -142,6 +142,7 @@ const documentTypeIcons = {
NOTION_CONNECTOR: IconBrandNotion,
FILE: File,
YOUTUBE_VIDEO: IconBrandYoutube,
GITHUB_CONNECTOR: IconBrandGithub,
} as const;
const columns: ColumnDef<Document>[] = [
@ -1028,4 +1029,4 @@ function RowActions({ row }: { row: Row<Document> }) {
);
}
export { DocumentsTable }
export { DocumentsTable }