mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-09 13:54:40 +00:00
documents table migration, fix/update github indexing
This commit is contained in:
parent
bb198e38c0
commit
a26fac435b
5 changed files with 197 additions and 17 deletions
|
@ -558,4 +558,57 @@ class ConnectorService:
|
|||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, youtube_chunks
|
||||
return result_object, youtube_chunks
|
||||
|
||||
async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for GitHub documents and return both the source information and the retrieved chunks.

    The chunks are fetched through ``self.retriever.hybrid_search`` restricted to
    ``document_type="GITHUB_CONNECTOR"``. Every chunk's ``document`` dict is stamped
    with a fresh id taken from ``self.source_id_counter`` (the counter advances once
    per chunk, including chunks whose source is later deduplicated).

    Args:
        user_query: Text passed to the retriever as ``query_text``.
        user_id: Forwarded to the retriever unchanged.
        search_space_id: Forwarded to the retriever unchanged.
        top_k: Maximum number of chunks requested from the retriever.

    Returns:
        tuple: (sources_info, github_chunks) where ``sources_info`` is a dict with
        ``id``, ``name``, ``type`` and a de-duplicated ``sources`` list, and
        ``github_chunks`` is the (mutated in place) list returned by the retriever.
    """
    github_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="GITHUB_CONNECTOR"
    )

    # Treat "no result" from the retriever as an empty result set instead of crashing.
    if github_chunks is None:
        github_chunks = []

    # Map github_chunks to the required format, keyed by URL (or title) to drop duplicates
    mapped_sources = {}
    for chunk in github_chunks:
        # Fix for UI - assign a unique ID for citation/source tracking.
        # setdefault keeps this consistent with the defensive metadata reads below:
        # a chunk without a 'document' entry gets an empty one rather than a KeyError.
        document = chunk.setdefault('document', {})
        document['id'] = self.source_id_counter

        # Extract document metadata (tolerate an explicit None)
        metadata = document.get('metadata') or {}

        # Content preview used only when the metadata carries no description
        content_preview = (chunk.get('content') or '')[:100]

        # Create a mapped source entry
        source = {
            "id": self.source_id_counter,
            "title": document.get('title', 'GitHub Document'),  # Use specific title if available
            "description": metadata.get('description', content_preview),  # Use description or content preview
            "url": metadata.get('url', '')  # Use URL if available in metadata
        }

        self.source_id_counter += 1

        # Use a unique identifier for tracking unique sources (URL preferred);
        # the first chunk seen for a given key wins.
        source_key = source.get("url") or source.get("title")
        if source_key and source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Convert to list of sources
    sources_list = list(mapped_sources.values())

    # Create result object
    result_object = {
        "id": 7,  # Assuming 7 is the next available ID
        "name": "GitHub",
        "type": "GITHUB_CONNECTOR",
        "sources": sources_list,
    }

    return result_object, github_chunks
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue