diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py
index 2d4584f..e73aefa 100644
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@@ -572,12 +572,15 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
         TOP_K = 20
     elif configuration.num_sections == 6:
         TOP_K = 30
+    else:
+        TOP_K = 10

     relevant_documents = []
     async with async_session_maker() as db_session:
         try:
             # Create connector service inside the db_session scope
-            connector_service = ConnectorService(db_session)
+            connector_service = ConnectorService(db_session, user_id=configuration.user_id)
+            await connector_service.initialize_counter()

             relevant_documents = await fetch_relevant_documents(
                 research_questions=all_questions,
@@ -875,7 +878,8 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre
     async with async_session_maker() as db_session:
         try:
             # Create connector service inside the db_session scope
-            connector_service = ConnectorService(db_session)
+            connector_service = ConnectorService(db_session, user_id=configuration.user_id)
+            await connector_service.initialize_counter()

             # Use the reformulated query as a single research question
             research_questions = [reformulated_query]
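Both workflow nodes now build the connector service the same way: construct it with the caller's user_id, then await initialize_counter() before any retrieval runs. A minimal runnable sketch of that ordering follows; the stub class, the fake chunk count, and the None session are illustrative assumptions, not the module's actual code.

```python
import asyncio

class StubConnectorService:
    """Stand-in for app.utils.connector_service.ConnectorService (assumed shape)."""

    def __init__(self, session, user_id=None):
        self.session = session
        self.user_id = user_id
        self.source_id_counter = 100000  # mirrors the new high default in the diff

    async def initialize_counter(self):
        # The real method counts the user's chunks in the database;
        # here we pretend the user already owns 42 chunks.
        self.source_id_counter = 42 + 1

async def main():
    # Same two-step pattern as process_sections / handle_qna_workflow:
    service = StubConnectorService(session=None, user_id="user-123")
    await service.initialize_counter()  # completes before any search_* call
    print(service.source_id_counter)    # -> 43

asyncio.run(main())
```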
+ """ + if self.user_id: + try: + # Count total chunks for documents belonging to this user + + result = await self.session.execute( + select(func.count(Chunk.id)) + .join(Document) + .filter(Document.user_id == self.user_id) + ) + chunk_count = result.scalar() or 0 + self.source_id_counter = chunk_count + 1 + print(f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}") + except Exception as e: + print(f"Error initializing source_id_counter: {str(e)}") + # Fallback to default value + self.source_id_counter = 1 + async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents @@ -57,16 +81,14 @@ class ConnectorService: # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: - for i, chunk in enumerate(crawled_urls_chunks): - # Fix for UI - crawled_urls_chunks[i]['document']['id'] = self.source_id_counter + for _i, chunk in enumerate(crawled_urls_chunks): # Extract document metadata document = chunk.get('document', {}) metadata = document.get('metadata', {}) # Create a source entry source = { - "id": self.source_id_counter, + "id": document.get('id', self.source_id_counter), "title": document.get('title', 'Untitled Document'), "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), "url": metadata.get('url', '') @@ -123,16 +145,14 @@ class ConnectorService: # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: - for i, chunk in enumerate(files_chunks): - # Fix for UI - files_chunks[i]['document']['id'] = self.source_id_counter + for _i, chunk in enumerate(files_chunks): # Extract document metadata document = chunk.get('document', {}) metadata = document.get('metadata', {}) # Create a source entry source = { - "id": self.source_id_counter, + "id": document.get('id', self.source_id_counter), "title": document.get('title', 'Untitled Document'), "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), "url": metadata.get('url', '') @@ -337,9 +357,7 @@ class ConnectorService: # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: - for i, chunk in enumerate(slack_chunks): - # Fix for UI - slack_chunks[i]['document']['id'] = self.source_id_counter + for _i, chunk in enumerate(slack_chunks): # Extract document metadata document = chunk.get('document', {}) metadata = document.get('metadata', {}) @@ -365,7 +383,7 @@ class ConnectorService: url = f"https://slack.com/app_redirect?channel={channel_id}" source = { - "id": self.source_id_counter, + "id": document.get('id', self.source_id_counter), "title": title, "description": description, "url": url, @@ -428,10 +446,7 @@ class ConnectorService: # Process each chunk and create sources directly without deduplication sources_list = [] async with self.counter_lock: - for i, chunk in enumerate(notion_chunks): - # Fix for UI - notion_chunks[i]['document']['id'] = self.source_id_counter - + for _i, chunk in enumerate(notion_chunks): # Extract document metadata document = chunk.get('document', {}) metadata = document.get('metadata', {}) @@ -458,7 +473,7 @@ class ConnectorService: url = f"https://notion.so/{page_id.replace('-', '')}" source = { - "id": 
@@ -57,16 +81,14 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(crawled_urls_chunks):
-                # Fix for UI
-                crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
+            for _i, chunk in enumerate(crawled_urls_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})

                 # Create a source entry
                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": document.get('title', 'Untitled Document'),
                     "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
                     "url": metadata.get('url', '')
@@ -123,16 +145,14 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(files_chunks):
-                # Fix for UI
-                files_chunks[i]['document']['id'] = self.source_id_counter
+            for _i, chunk in enumerate(files_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})

                 # Create a source entry
                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": document.get('title', 'Untitled Document'),
                     "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
                     "url": metadata.get('url', '')
@@ -337,9 +357,7 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(slack_chunks):
-                # Fix for UI
-                slack_chunks[i]['document']['id'] = self.source_id_counter
+            for _i, chunk in enumerate(slack_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -365,7 +383,7 @@ class ConnectorService:
                     url = f"https://slack.com/app_redirect?channel={channel_id}"

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": url,
@@ -428,10 +446,7 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(notion_chunks):
-                # Fix for UI
-                notion_chunks[i]['document']['id'] = self.source_id_counter
-
+            for _i, chunk in enumerate(notion_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -458,7 +473,7 @@ class ConnectorService:
                     url = f"https://notion.so/{page_id.replace('-', '')}"

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": url,
@@ -522,9 +537,6 @@ class ConnectorService:
         sources_list = []
         async with self.counter_lock:
             for i, chunk in enumerate(extension_chunks):
-                # Fix for UI
-                extension_chunks[i]['document']['id'] = self.source_id_counter
-
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -569,7 +581,7 @@ class ConnectorService:
                     pass

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": webpage_url
@@ -632,10 +644,7 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(youtube_chunks):
-                # Fix for UI
-                youtube_chunks[i]['document']['id'] = self.source_id_counter
-
+            for _i, chunk in enumerate(youtube_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -644,7 +653,7 @@ class ConnectorService:
                 video_title = metadata.get('video_title', 'Untitled Video')
                 video_id = metadata.get('video_id', '')
                 channel_name = metadata.get('channel_name', '')
-                published_date = metadata.get('published_date', '')
+                # published_date = metadata.get('published_date', '')

                 # Create a more descriptive title for YouTube videos
                 title = video_title
@@ -660,7 +669,7 @@ class ConnectorService:
                     url = f"https://www.youtube.com/watch?v={video_id}" if video_id else ""

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": url,
@@ -719,17 +728,14 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(github_chunks):
-                # Fix for UI - assign a unique ID for citation/source tracking
-                github_chunks[i]['document']['id'] = self.source_id_counter
-
+            for _i, chunk in enumerate(github_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})

                 # Create a source entry
                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": document.get('title', 'GitHub Document'),  # Use specific title if available
                     "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
                     "url": metadata.get('url', '')  # Use URL if available in metadata
@@ -792,10 +798,7 @@ class ConnectorService:
         # Process each chunk and create sources directly without deduplication
         sources_list = []
         async with self.counter_lock:
-            for i, chunk in enumerate(linear_chunks):
-                # Fix for UI
-                linear_chunks[i]['document']['id'] = self.source_id_counter
-
+            for _i, chunk in enumerate(linear_chunks):
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -831,7 +834,7 @@ class ConnectorService:
                     url = f"https://linear.app/issue/{issue_identifier}"

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": url,
@@ -1004,8 +1007,6 @@ class ConnectorService:
         sources_list = []
         async with self.counter_lock:
             for i, chunk in enumerate(discord_chunks):
-                # Fix for UI
-                discord_chunks[i]['document']['id'] = self.source_id_counter
                 # Extract document metadata
                 document = chunk.get('document', {})
                 metadata = document.get('metadata', {})
@@ -1034,7 +1035,7 @@ class ConnectorService:
                     url = f"https://discord.com/channels/@me/{channel_id}"

                 source = {
-                    "id": self.source_id_counter,
+                    "id": document.get('id', self.source_id_counter),
                     "title": title,
                     "description": description,
                     "url": url,
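Every connector search now resolves a source id the same way: prefer the document's own database id, and fall back to the session counter only when the result carries none. The removed per-chunk `document['id'] = counter` mutation is gone, so stored documents keep stable ids across searches. Below is a small self-contained sketch of that rule; the helper name and the counter-advance policy are assumptions for illustration, since the diff does not show how the real counter increments.

```python
import asyncio

async def build_sources(chunks, counter_start=100000):
    """Assign ids the way the hunks above do: stored id first, counter as fallback."""
    counter = counter_start
    lock = asyncio.Lock()
    sources = []
    async with lock:  # mirrors the counter_lock guarding shared state
        for chunk in chunks:
            document = chunk.get("document", {})
            source_id = document.get("id", counter)
            if "id" not in document:
                counter += 1  # assumed: counter advances only for id-less results
            sources.append({"id": source_id, "title": document.get("title", "Untitled Document")})
    return sources

result = asyncio.run(build_sources([
    {"document": {"id": 7, "title": "Stored doc"}},  # keeps its database id
    {"document": {"title": "Ad-hoc web result"}},    # falls back to the counter
]))
print(result)  # ids: 7 and 100000
```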
diff --git a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx
index 56893b7..6a547a6 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx
@@ -183,8 +183,8 @@ const SourcesDialogContent = ({