From a26fac435b8bcee0e189ffa422729e2c9ca9ac7c Mon Sep 17 00:00:00 2001
From: Adamsmith6300
Date: Sun, 13 Apr 2025 21:23:05 -0700
Subject: [PATCH] documents table migration, fix/update github indexing

---
Review notes (this region is commentary and is not part of the commit):

* Purpose: registers GITHUB_CONNECTOR as a `documenttype` enum value, makes
  the GitHub connector skip a fixed set of directories while walking a
  repository, adds a `search_github` retrieval path to ConnectorService and
  to the streaming search task, and maps GITHUB_CONNECTOR to a GitHub icon
  in the documents table.
* Migration upgrade: changed from `ADD VALUE` to `ADD VALUE IF NOT EXISTS`
  so the step does not fail when the label is already present. The blob
  hash on the migration's `index` line predates this one-line edit.
  NOTE(review): on PostgreSQL releases before 12, `ALTER TYPE ... ADD VALUE`
  cannot run inside a transaction block - confirm the target server version
  and Alembic's transaction settings.
* Migration downgrade: deletes every `documents` row whose `document_type`
  is GITHUB_CONNECTOR before the column is cast to the recreated enum.
* get_user_repositories: the call now passes `type='owner'`, while the
  unchanged context comments above it still describe `type='all'`.
  `last_updated` is now `repo.updated_at` passed through unchanged; the
  previous code called `.isoformat()` on it. NOTE(review): confirm callers
  accept whatever type github3 returns for that attribute here.
* stream_connector_search_results: the new branch emits the start message
  through `add_terminal_message` and also through `print`.
* search_github: the result object uses the hard-coded id 7, and the inline
  comment marks that as an assumption. NOTE(review): verify it does not
  collide with another connector's result id.
* Line structure and indentation of this patch were rebuilt from a
  whitespace-collapsed copy; hunk line counts match the `@@` headers, but
  leading whitespace should be checked against the tree before applying
  (or apply with `--ignore-whitespace`).

 ...1_add_github_connector_to_documenttype_.py | 70 +++++++++++++++++++
 .../app/connectors/github_connector.py        | 57 +++++++++++----
 .../tasks/stream_connector_search_results.py  | 27 +++++++
 .../app/utils/connector_service.py            | 55 ++++++++++++++-
 .../documents/(manage)/page.tsx               |  5 +-
 5 files changed, 197 insertions(+), 17 deletions(-)
 create mode 100644 surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py

diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py
new file mode 100644
index 0000000..1f15912
--- /dev/null
+++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py
@@ -0,0 +1,70 @@
+"""Add GITHUB_CONNECTOR to DocumentType enum
+
+Revision ID: e55302644c51
+Revises: 1
+Create Date: 2025-04-13 19:56:00.059921
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'e55302644c51'
+down_revision: Union[str, None] = '1'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+# Define the ENUM type name and the new value
+ENUM_NAME = 'documenttype'  # Make sure this matches the name in your DB (usually lowercase class name)
+NEW_VALUE = 'GITHUB_CONNECTOR'
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE IF NOT EXISTS '{NEW_VALUE}'")
+
+
+# Warning: This will delete all rows with the new value
+def downgrade() -> None:
+    """Downgrade schema - remove GITHUB_CONNECTOR from enum."""
+
+    # The old type name
+    old_enum_name = f"{ENUM_NAME}_old"
+
+    # Enum values *before* GITHUB_CONNECTOR was added
+    old_values = (
+        'EXTENSION',
+        'CRAWLED_URL',
+        'FILE',
+        'SLACK_CONNECTOR',
+        'NOTION_CONNECTOR',
+        'YOUTUBE_VIDEO'
+    )
+    old_values_sql = ", ".join([f"'{v}'" for v in old_values])
+
+    # Table and column names (adjust if different)
+    table_name = 'documents'
+    column_name = 'document_type'
+
+    # 1. Rename the current enum type
+    op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}")
+
+    # 2. Create the new enum type with the old values
+    op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})")
+
+    # 3. Update the table:
+    op.execute(
+        f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'"
+    )
+
+    # 4. Alter the column to use the new enum type (casting old values)
+    op.execute(
+        f"ALTER TABLE {table_name} ALTER COLUMN {column_name} "
+        f"TYPE {ENUM_NAME} USING {column_name}::text::{ENUM_NAME}"
+    )
+
+    # 5. Drop the old enum type
+    op.execute(f"DROP TYPE {old_enum_name}")
+    # ### end Alembic commands ###
diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py
index d827dac..265f89b 100644
--- a/surfsense_backend/app/connectors/github_connector.py
+++ b/surfsense_backend/app/connectors/github_connector.py
@@ -2,7 +2,6 @@ import base64
 import logging
 from typing import List, Optional, Dict, Any, Tuple
 from github3 import login as github_login, exceptions as github_exceptions
-from github3.repos.repo import Repository
 from github3.repos.contents import Contents
 from github3.exceptions import ForbiddenError, NotFoundError
 
@@ -26,6 +25,33 @@ MAX_FILE_SIZE = 1 * 1024 * 1024
 class GitHubConnector:
     """Connector for interacting with the GitHub API."""
 
+    # Directories to skip during file traversal
+    SKIPPED_DIRS = {
+        # Version control
+        '.git',
+        # Dependencies
+        'node_modules',
+        'vendor',
+        # Build artifacts / Caches
+        'build',
+        'dist',
+        'target',
+        '__pycache__',
+        # Virtual environments
+        'venv',
+        '.venv',
+        'env',
+        # IDE/Editor config
+        '.vscode',
+        '.idea',
+        '.project',
+        '.settings',
+        # Temporary / Logs
+        'tmp',
+        'logs',
+        # Add other project-specific irrelevant directories if needed
+    }
+
     def __init__(self, token: str):
         """
         Initializes the GitHub connector.
@@ -54,17 +80,16 @@ class GitHubConnector:
             # type='owner' fetches repos owned by the user
             # type='member' fetches repos the user is a collaborator on (including orgs)
             # type='all' fetches both
-            for repo in self.gh.repositories(type='all', sort='updated'):
-                if isinstance(repo, Repository):
-                    repos_data.append({
-                        "id": repo.id,
-                        "name": repo.name,
-                        "full_name": repo.full_name,
-                        "private": repo.private,
-                        "url": repo.html_url,
-                        "description": repo.description or "",
-                        "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
-                    })
+            for repo in self.gh.repositories(type='owner', sort='updated'):
+                repos_data.append({
+                    "id": repo.id,
+                    "name": repo.name,
+                    "full_name": repo.full_name,
+                    "private": repo.private,
+                    "url": repo.html_url,
+                    "description": repo.description or "",
+                    "last_updated": repo.updated_at if repo.updated_at else None,
+                })
             logger.info(f"Fetched {len(repos_data)} repositories.")
             return repos_data
         except Exception as e:
@@ -90,8 +115,7 @@ class GitHubConnector:
             if not repo:
                 logger.warning(f"Repository '{repo_full_name}' not found.")
                 return []
-
-            contents = repo.directory_contents(path=path) # Use directory_contents for clarity
+            contents = repo.directory_contents(directory_path=path) # Use directory_contents for clarity
 
             # contents returns a list of tuples (name, content_obj)
             for item_name, content_item in contents:
@@ -99,6 +123,11 @@ class GitHubConnector:
                     continue
 
                 if content_item.type == 'dir':
+                    # Check if the directory name is in the skipped list
+                    if content_item.name in self.SKIPPED_DIRS:
+                        logger.debug(f"Skipping directory: {content_item.path}")
+                        continue # Skip recursion for this directory
+
                     # Recursively fetch contents of subdirectory
                     files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
                 elif content_item.type == 'file':
diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py
index 5c563dc..b9a703a 100644
--- a/surfsense_backend/app/tasks/stream_connector_search_results.py
+++ b/surfsense_backend/app/tasks/stream_connector_search_results.py
@@ -244,6 +244,33 @@ async def stream_connector_search_results(
                 all_raw_documents.extend(notion_chunks)
 
 
+            # Github Connector
+            if connector == "GITHUB_CONNECTOR":
+                # Send terminal message about starting search
+                yield streaming_service.add_terminal_message("Starting to search for GitHub connector...")
+                print("Starting to search for GitHub connector...")
+                # Search using Github API with reformulated query
+                result_object, github_chunks = await connector_service.search_github(
+                    user_query=reformulated_query,
+                    user_id=user_id,
+                    search_space_id=search_space_id,
+                    top_k=TOP_K
+                )
+
+                # Send terminal message about search results
+                yield streaming_service.add_terminal_message(
+                    f"Found {len(result_object['sources'])} relevant results from Github",
+                    "success"
+                )
+
+                # Update sources
+                all_sources.append(result_object)
+                yield streaming_service.update_sources(all_sources)
+
+                # Add documents to collection
+                all_raw_documents.extend(github_chunks)
+
+
 
 
         # If we have documents to research
diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py
index 9e676e5..8d7a551 100644
--- a/surfsense_backend/app/utils/connector_service.py
+++ b/surfsense_backend/app/utils/connector_service.py
@@ -558,4 +558,57 @@ class ConnectorService:
             "sources": sources_list,
         }
 
-        return result_object, youtube_chunks
\ No newline at end of file
+        return result_object, youtube_chunks
+
+    async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
+        """
+        Search for GitHub documents and return both the source information and langchain documents
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        github_chunks = await self.retriever.hybrid_search(
+            query_text=user_query,
+            top_k=top_k,
+            user_id=user_id,
+            search_space_id=search_space_id,
+            document_type="GITHUB_CONNECTOR"
+        )
+
+        # Map github_chunks to the required format
+        mapped_sources = {}
+        for i, chunk in enumerate(github_chunks):
+            # Fix for UI - assign a unique ID for citation/source tracking
+            github_chunks[i]['document']['id'] = self.source_id_counter
+
+            # Extract document metadata
+            document = chunk.get('document', {})
+            metadata = document.get('metadata', {})
+
+            # Create a mapped source entry
+            source = {
+                "id": self.source_id_counter,
+                "title": document.get('title', 'GitHub Document'),  # Use specific title if available
+                "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
+                "url": metadata.get('url', '')  # Use URL if available in metadata
+            }
+
+            self.source_id_counter += 1
+
+            # Use a unique identifier for tracking unique sources (URL preferred)
+            source_key = source.get("url") or source.get("title")
+            if source_key and source_key not in mapped_sources:
+                mapped_sources[source_key] = source
+
+        # Convert to list of sources
+        sources_list = list(mapped_sources.values())
+
+        # Create result object
+        result_object = {
+            "id": 7,  # Assuming 7 is the next available ID
+            "name": "GitHub",
+            "type": "GITHUB_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, github_chunks
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
index 66f8b08..18b4357 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
@@ -94,7 +94,7 @@ import rehypeSanitize from "rehype-sanitize";
 import remarkGfm from "remark-gfm";
 import { DocumentViewer } from "@/components/document-viewer";
 import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
-import { IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
+import { IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
 
 // Define animation variants for reuse
 const fadeInScale = {
@@ -142,6 +142,7 @@ const documentTypeIcons = {
   NOTION_CONNECTOR: IconBrandNotion,
   FILE: File,
   YOUTUBE_VIDEO: IconBrandYoutube,
+  GITHUB_CONNECTOR: IconBrandGithub,
 } as const;
 
 const columns: ColumnDef[] = [
@@ -1028,4 +1029,4 @@ function RowActions({ row }: { row: Row }) {
   );
 }
 
-export { DocumentsTable }
\ No newline at end of file
+export { DocumentsTable }