mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-05 03:59:06 +00:00
documents table migration, fix/update github indexing
This commit is contained in:
parent
bb198e38c0
commit
a26fac435b
5 changed files with 197 additions and 17 deletions
|
@ -0,0 +1,70 @@
|
||||||
|
"""Add GITHUB_CONNECTOR to DocumentType enum
|
||||||
|
|
||||||
|
Revision ID: e55302644c51
|
||||||
|
Revises: 1
|
||||||
|
Create Date: 2025-04-13 19:56:00.059921
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
revision: str = 'e55302644c51'
down_revision: Union[str, None] = '1'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


# Name of the PostgreSQL ENUM type backing DocumentType, and the member added
# by this revision.
ENUM_NAME = 'documenttype'  # Make sure this matches the name in your DB (usually lowercase class name)
NEW_VALUE = 'GITHUB_CONNECTOR'
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Upgrade schema: append GITHUB_CONNECTOR to the documenttype enum.

    IF NOT EXISTS (PostgreSQL 9.6+) makes the migration idempotent, so
    re-running it after a partially applied deploy is a no-op instead of
    failing with a duplicate-value error.
    """
    # NOTE(review): on PostgreSQL < 12, ALTER TYPE ... ADD VALUE cannot run
    # inside a transaction block; Alembic must use autocommit there — confirm
    # the target server version if this errors.
    op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE IF NOT EXISTS '{NEW_VALUE}'")
|
||||||
|
|
||||||
|
|
||||||
|
# Warning: This will delete all rows with the new value
def downgrade() -> None:
    """Downgrade schema - remove GITHUB_CONNECTOR from enum."""
    # Temporary name the current (wider) type is parked under.
    previous_type = f"{ENUM_NAME}_old"

    # Enum members exactly as they existed before GITHUB_CONNECTOR was added.
    legacy_members = (
        'EXTENSION',
        'CRAWLED_URL',
        'FILE',
        'SLACK_CONNECTOR',
        'NOTION_CONNECTOR',
        'YOUTUBE_VIDEO'
    )
    members_sql = ", ".join([f"'{m}'" for m in legacy_members])

    # Table and column holding the enum (adjust if different).
    tbl = 'documents'
    col = 'document_type'

    # Step 1: park the current enum type under the temporary name.
    op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {previous_type}")

    # Step 2: recreate the original (pre-GITHUB_CONNECTOR) type under the
    # real name.
    op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({members_sql})")

    # Step 3: rows using the removed value cannot be cast back — drop them
    # before re-pointing the column.
    op.execute(
        f"DELETE FROM {tbl} WHERE {col}::text = '{NEW_VALUE}'"
    )

    # Step 4: switch the column to the recreated type via a text round-trip
    # cast of the surviving values.
    op.execute(
        f"ALTER TABLE {tbl} ALTER COLUMN {col} "
        f"TYPE {ENUM_NAME} USING {col}::text::{ENUM_NAME}"
    )

    # Step 5: the parked type is now unreferenced and can be dropped.
    op.execute(f"DROP TYPE {previous_type}")
# ### end Alembic commands ###
|
|
@ -2,7 +2,6 @@ import base64
|
||||||
import logging
|
import logging
|
||||||
from typing import List, Optional, Dict, Any, Tuple
|
from typing import List, Optional, Dict, Any, Tuple
|
||||||
from github3 import login as github_login, exceptions as github_exceptions
|
from github3 import login as github_login, exceptions as github_exceptions
|
||||||
from github3.repos.repo import Repository
|
|
||||||
from github3.repos.contents import Contents
|
from github3.repos.contents import Contents
|
||||||
from github3.exceptions import ForbiddenError, NotFoundError
|
from github3.exceptions import ForbiddenError, NotFoundError
|
||||||
|
|
||||||
|
@ -26,6 +25,33 @@ MAX_FILE_SIZE = 1 * 1024 * 1024
|
||||||
class GitHubConnector:
|
class GitHubConnector:
|
||||||
"""Connector for interacting with the GitHub API."""
|
"""Connector for interacting with the GitHub API."""
|
||||||
|
|
||||||
|
# Directories to skip during file traversal
|
||||||
|
SKIPPED_DIRS = {
|
||||||
|
# Version control
|
||||||
|
'.git',
|
||||||
|
# Dependencies
|
||||||
|
'node_modules',
|
||||||
|
'vendor',
|
||||||
|
# Build artifacts / Caches
|
||||||
|
'build',
|
||||||
|
'dist',
|
||||||
|
'target',
|
||||||
|
'__pycache__',
|
||||||
|
# Virtual environments
|
||||||
|
'venv',
|
||||||
|
'.venv',
|
||||||
|
'env',
|
||||||
|
# IDE/Editor config
|
||||||
|
'.vscode',
|
||||||
|
'.idea',
|
||||||
|
'.project',
|
||||||
|
'.settings',
|
||||||
|
# Temporary / Logs
|
||||||
|
'tmp',
|
||||||
|
'logs',
|
||||||
|
# Add other project-specific irrelevant directories if needed
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, token: str):
|
def __init__(self, token: str):
|
||||||
"""
|
"""
|
||||||
Initializes the GitHub connector.
|
Initializes the GitHub connector.
|
||||||
|
@ -54,8 +80,7 @@ class GitHubConnector:
|
||||||
# type='owner' fetches repos owned by the user
|
# type='owner' fetches repos owned by the user
|
||||||
# type='member' fetches repos the user is a collaborator on (including orgs)
|
# type='member' fetches repos the user is a collaborator on (including orgs)
|
||||||
# type='all' fetches both
|
# type='all' fetches both
|
||||||
for repo in self.gh.repositories(type='all', sort='updated'):
|
for repo in self.gh.repositories(type='owner', sort='updated'):
|
||||||
if isinstance(repo, Repository):
|
|
||||||
repos_data.append({
|
repos_data.append({
|
||||||
"id": repo.id,
|
"id": repo.id,
|
||||||
"name": repo.name,
|
"name": repo.name,
|
||||||
|
@ -63,7 +88,7 @@ class GitHubConnector:
|
||||||
"private": repo.private,
|
"private": repo.private,
|
||||||
"url": repo.html_url,
|
"url": repo.html_url,
|
||||||
"description": repo.description or "",
|
"description": repo.description or "",
|
||||||
"last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
|
"last_updated": repo.updated_at if repo.updated_at else None,
|
||||||
})
|
})
|
||||||
logger.info(f"Fetched {len(repos_data)} repositories.")
|
logger.info(f"Fetched {len(repos_data)} repositories.")
|
||||||
return repos_data
|
return repos_data
|
||||||
|
@ -90,8 +115,7 @@ class GitHubConnector:
|
||||||
if not repo:
|
if not repo:
|
||||||
logger.warning(f"Repository '{repo_full_name}' not found.")
|
logger.warning(f"Repository '{repo_full_name}' not found.")
|
||||||
return []
|
return []
|
||||||
|
contents = repo.directory_contents(directory_path=path) # Use directory_contents for clarity
|
||||||
contents = repo.directory_contents(path=path) # Use directory_contents for clarity
|
|
||||||
|
|
||||||
# contents returns a list of tuples (name, content_obj)
|
# contents returns a list of tuples (name, content_obj)
|
||||||
for item_name, content_item in contents:
|
for item_name, content_item in contents:
|
||||||
|
@ -99,6 +123,11 @@ class GitHubConnector:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if content_item.type == 'dir':
|
if content_item.type == 'dir':
|
||||||
|
# Check if the directory name is in the skipped list
|
||||||
|
if content_item.name in self.SKIPPED_DIRS:
|
||||||
|
logger.debug(f"Skipping directory: {content_item.path}")
|
||||||
|
continue # Skip recursion for this directory
|
||||||
|
|
||||||
# Recursively fetch contents of subdirectory
|
# Recursively fetch contents of subdirectory
|
||||||
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
|
files_list.extend(self.get_repository_files(repo_full_name, path=content_item.path))
|
||||||
elif content_item.type == 'file':
|
elif content_item.type == 'file':
|
||||||
|
|
|
@ -244,6 +244,33 @@ async def stream_connector_search_results(
|
||||||
all_raw_documents.extend(notion_chunks)
|
all_raw_documents.extend(notion_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
# Github Connector
|
||||||
|
if connector == "GITHUB_CONNECTOR":
|
||||||
|
# Send terminal message about starting search
|
||||||
|
yield streaming_service.add_terminal_message("Starting to search for GitHub connector...")
|
||||||
|
print("Starting to search for GitHub connector...")
|
||||||
|
# Search using Github API with reformulated query
|
||||||
|
result_object, github_chunks = await connector_service.search_github(
|
||||||
|
user_query=reformulated_query,
|
||||||
|
user_id=user_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
top_k=TOP_K
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send terminal message about search results
|
||||||
|
yield streaming_service.add_terminal_message(
|
||||||
|
f"Found {len(result_object['sources'])} relevant results from Github",
|
||||||
|
"success"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update sources
|
||||||
|
all_sources.append(result_object)
|
||||||
|
yield streaming_service.update_sources(all_sources)
|
||||||
|
|
||||||
|
# Add documents to collection
|
||||||
|
all_raw_documents.extend(github_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# If we have documents to research
|
# If we have documents to research
|
||||||
|
|
|
@ -559,3 +559,56 @@ class ConnectorService:
|
||||||
}
|
}
|
||||||
|
|
||||||
return result_object, youtube_chunks
|
return result_object, youtube_chunks
|
||||||
|
|
||||||
|
async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for GitHub documents and return both the source information and langchain documents.

    Args:
        user_query: Query text passed to the hybrid retriever.
        user_id: Owner whose documents are searched.
        search_space_id: Search space the query is restricted to.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        tuple: (sources_info, langchain_documents)
    """
    github_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="GITHUB_CONNECTOR"
    )

    # Map github_chunks to the required format, de-duplicating sources.
    mapped_sources = {}
    for chunk in github_chunks:
        # Fix for UI - assign a unique ID for citation/source tracking.
        # setdefault guards against a chunk arriving without a 'document'
        # dict (the original indexed assignment would raise KeyError there,
        # while the read path below already treats 'document' as optional).
        document = chunk.setdefault('document', {})
        document['id'] = self.source_id_counter

        # Extract document metadata
        metadata = document.get('metadata', {})

        # Create a mapped source entry
        source = {
            "id": self.source_id_counter,
            "title": document.get('title', 'GitHub Document'),  # Use specific title if available
            "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
            "url": metadata.get('url', '')  # Use URL if available in metadata
        }

        self.source_id_counter += 1

        # Use a unique identifier for tracking unique sources (URL preferred)
        source_key = source.get("url") or source.get("title")
        if source_key and source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Convert to list of sources
    sources_list = list(mapped_sources.values())

    # Create result object
    result_object = {
        "id": 7,  # Assuming 7 is the next available connector ID
        "name": "GitHub",
        "type": "GITHUB_CONNECTOR",
        "sources": sources_list,
    }

    return result_object, github_chunks
|
||||||
|
|
|
@ -94,7 +94,7 @@ import rehypeSanitize from "rehype-sanitize";
|
||||||
import remarkGfm from "remark-gfm";
|
import remarkGfm from "remark-gfm";
|
||||||
import { DocumentViewer } from "@/components/document-viewer";
|
import { DocumentViewer } from "@/components/document-viewer";
|
||||||
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
||||||
import { IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
|
import { IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube } from "@tabler/icons-react";
|
||||||
|
|
||||||
// Define animation variants for reuse
|
// Define animation variants for reuse
|
||||||
const fadeInScale = {
|
const fadeInScale = {
|
||||||
|
@ -142,6 +142,7 @@ const documentTypeIcons = {
|
||||||
NOTION_CONNECTOR: IconBrandNotion,
|
NOTION_CONNECTOR: IconBrandNotion,
|
||||||
FILE: File,
|
FILE: File,
|
||||||
YOUTUBE_VIDEO: IconBrandYoutube,
|
YOUTUBE_VIDEO: IconBrandYoutube,
|
||||||
|
GITHUB_CONNECTOR: IconBrandGithub,
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
const columns: ColumnDef<Document>[] = [
|
const columns: ColumnDef<Document>[] = [
|
||||||
|
|
Loading…
Add table
Reference in a new issue