feat: add extension connector search

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-03-26 20:41:47 -07:00
parent 8cd1264d3f
commit 77833f21f3
3 changed files with 126 additions and 2 deletions

View file

@ -59,6 +59,33 @@ async def stream_connector_search_results(
# Process each selected connector
for connector in selected_connectors:
# Extension Docs
if connector == "EXTENSION":
# Send terminal message about starting search
yield streaming_service.add_terminal_message("Starting to search for extension...")
# Search for extension data using the reformulated query
result_object, extension_chunks = await connector_service.search_extension(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=TOP_K
)
# Send terminal message about search results
yield streaming_service.add_terminal_message(
f"Found {len(result_object['sources'])} relevant extension documents",
"success"
)
# Update sources
all_sources.append(result_object)
yield streaming_service.update_sources(all_sources)
# Add documents to collection
all_raw_documents.extend(extension_chunks)
# Crawled URLs
if connector == "CRAWLED_URL":
# Send terminal message about starting search

View file

@ -383,3 +383,100 @@ class ConnectorService:
}
return result_object, notion_chunks
async def search_extension(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
    """
    Search for extension (browsing-history) data and return both the source
    information and the langchain document chunks.

    Args:
        user_query: The user's query
        user_id: The user's ID
        search_space_id: The search space ID to search in
        top_k: Maximum number of results to return

    Returns:
        tuple: (sources_info, langchain_documents) where sources_info is a dict
        with id/name/type/sources and langchain_documents is the raw chunk list.
    """
    extension_chunks = await self.retriever.hybrid_search(
        query_text=user_query,
        top_k=top_k,
        user_id=user_id,
        search_space_id=search_space_id,
        document_type="EXTENSION"
    )

    # Map extension_chunks to the required source format, deduplicating by URL+timestamp
    mapped_sources = {}
    for i, chunk in enumerate(extension_chunks):
        # Fix for UI: give each chunk's document a unique numeric id
        extension_chunks[i]['document']['id'] = self.source_id_counter

        # Extract document metadata
        document = chunk.get('document', {})
        metadata = document.get('metadata', {})

        # Extract extension-specific metadata
        webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
        webpage_url = metadata.get('VisitedWebPageURL', '')
        visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
        visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')

        # Create a more descriptive title for extension data
        title = webpage_title
        if visit_date:
            try:
                # Just extract the date part (before the 'T') for display
                formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
                title += f" (visited: {formatted_date})"
            except (AttributeError, TypeError):
                # Fallback if visit_date is not string-like — show it unformatted
                title += f" (visited: {visit_date})"

        # Create a short description from the chunk content (truncate at 100 chars)
        description = chunk.get('content', '')[:100]
        if len(description) == 100:
            description += "..."

        # Append visit duration if available and parseable
        if visit_duration:
            try:
                duration_seconds = int(visit_duration) / 1000
                if duration_seconds < 60:
                    duration_text = f"{duration_seconds:.1f} seconds"
                else:
                    duration_text = f"{duration_seconds/60:.1f} minutes"
                if description:
                    description += f" | Duration: {duration_text}"
            except (TypeError, ValueError):
                # Duration not a valid integer — omit it rather than fail
                pass

        source = {
            "id": self.source_id_counter,
            "title": title,
            "description": description,
            "url": webpage_url
        }
        self.source_id_counter += 1

        # Use URL and timestamp as a unique identifier for tracking unique sources
        # (the f-string key is always non-empty, so only membership matters)
        source_key = f"{webpage_url}_{visit_date}"
        if source_key not in mapped_sources:
            mapped_sources[source_key] = source

    # Convert to list of sources
    sources_list = list(mapped_sources.values())

    # Create result object
    result_object = {
        "id": 6,
        "name": "Extension",
        "type": "EXTENSION",
        "sources": sources_list,
    }

    return result_object, extension_chunks

@ -1 +1 @@
Subproject commit 269cef48438adfba31f5405898a8ef1b1231020a
Subproject commit 1de75613320f6d077ca04c6ec7a7441e07536613