mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-09 13:54:40 +00:00
feat: Added Extension
This commit is contained in:
parent
8cd1264d3f
commit
77833f21f3
3 changed files with 126 additions and 2 deletions
|
@ -59,6 +59,33 @@ async def stream_connector_search_results(
|
|||
|
||||
# Process each selected connector
|
||||
for connector in selected_connectors:
|
||||
# Extension Docs
|
||||
if connector == "EXTENSION":
|
||||
# Send terminal message about starting search
|
||||
yield streaming_service.add_terminal_message("Starting to search for extension...")
|
||||
|
||||
# Search for crawled URLs using reformulated query
|
||||
result_object, extension_chunks = await connector_service.search_extension(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
top_k=TOP_K
|
||||
)
|
||||
|
||||
# Send terminal message about search results
|
||||
yield streaming_service.add_terminal_message(
|
||||
f"Found {len(result_object['sources'])} relevant extension documents",
|
||||
"success"
|
||||
)
|
||||
|
||||
# Update sources
|
||||
all_sources.append(result_object)
|
||||
yield streaming_service.update_sources(all_sources)
|
||||
|
||||
# Add documents to collection
|
||||
all_raw_documents.extend(extension_chunks)
|
||||
|
||||
|
||||
# Crawled URLs
|
||||
if connector == "CRAWLED_URL":
|
||||
# Send terminal message about starting search
|
||||
|
|
|
@ -383,3 +383,100 @@ class ConnectorService:
|
|||
}
|
||||
|
||||
return result_object, notion_chunks
|
||||
|
||||
async def search_extension(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for browser-extension data (visited web pages) and return both the
    UI source information and the raw retriever chunks.

    Args:
        user_query: The user's query
        user_id: The user's ID
        search_space_id: The search space ID to search in
        top_k: Maximum number of results to return

    Returns:
        tuple: (result_object, extension_chunks) where result_object is a dict
        describing the "EXTENSION" connector with de-duplicated sources, and
        extension_chunks is the unmodified-order list of hybrid-search hits
        (each annotated with a UI-unique document id).
    """
    extension_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="EXTENSION"
    )

    # Unique sources keyed by URL + visit timestamp (first occurrence wins).
    mapped_sources = {}
    for i, chunk in enumerate(extension_chunks):
        # Fix for UI: give every chunk's document a globally unique id,
        # drawn from the shared per-service counter.
        extension_chunks[i]['document']['id'] = self.source_id_counter

        # Extract document metadata recorded by the browser extension.
        document = chunk.get('document', {})
        metadata = document.get('metadata', {})

        webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
        webpage_url = metadata.get('VisitedWebPageURL', '')
        visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
        visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')

        # Title: append the visit date (date part only) when available.
        title = webpage_title
        if visit_date:
            try:
                # Just extract the date part of the ISO timestamp for display.
                formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
                title += f" (visited: {formatted_date})"
            except (AttributeError, TypeError):
                # visit_date was not a string — fall back to the raw value.
                title += f" (visited: {visit_date})"

        # Description: first 100 chars of content, ellipsized when truncated.
        description = chunk.get('content', '')[:100]
        if len(description) == 100:
            description += "..."

        # Append a human-readable visit duration when it parses as an int
        # number of milliseconds.
        if visit_duration:
            try:
                duration_seconds = int(visit_duration) / 1000
                if duration_seconds < 60:
                    duration_text = f"{duration_seconds:.1f} seconds"
                else:
                    duration_text = f"{duration_seconds/60:.1f} minutes"

                if description:
                    description += f" | Duration: {duration_text}"
            except (ValueError, TypeError):
                # Duration was not numeric — omit it from the description.
                pass

        source = {
            "id": self.source_id_counter,
            "title": title,
            "description": description,
            "url": webpage_url
        }

        # Counter advances per chunk (not per unique source) so document ids
        # stay in sync with the ids assigned to extension_chunks above.
        self.source_id_counter += 1

        # URL + timestamp uniquely identifies one page visit.
        source_key = f"{webpage_url}_{visit_date}"
        if source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Create the connector result object consumed by the streaming UI.
    result_object = {
        "id": 6,
        "name": "Extension",
        "type": "EXTENSION",
        "sources": list(mapped_sources.values()),
    }

    return result_object, extension_chunks
|
|
@ -1 +1 @@
|
|||
Subproject commit 269cef48438adfba31f5405898a8ef1b1231020a
|
||||
Subproject commit 1de75613320f6d077ca04c6ec7a7441e07536613
|
Loading…
Add table
Add a link
Reference in a new issue