chore: update README and refactor ConnectorService for improved document handling and error management

DESKTOP-RTLN3BA\$punk committed 2025-04-27 20:39:17 -07:00
parent 1309f9e262
commit a971bb1f72
4 changed files with 399 additions and 303 deletions


@@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include
 - fastapi and related packages
 - fastapi-users: Authentication and user management
 - firecrawl-py: Web crawling capabilities
-- gpt-researcher: Advanced research capabilities
 - langchain components for AI workflows
 - litellm: LLM model integration
 - pgvector: Vector similarity search in PostgreSQL


@@ -143,7 +143,8 @@ async def fetch_relevant_documents(
     connectors_to_search: List[str],
     writer: StreamWriter = None,
     state: State = None,
-    top_k: int = 10
+    top_k: int = 10,
+    connector_service: ConnectorService = None
 ) -> List[Dict[str, Any]]:
     """
     Fetch relevant documents for research questions using the provided connectors.
@@ -162,7 +163,7 @@ async def fetch_relevant_documents(
         List of relevant documents
     """
     # Initialize services
-    connector_service = ConnectorService(db_session)
+    # connector_service = ConnectorService(db_session)
 
     # Only use streaming if both writer and state are provided
     streaming_service = state.streaming_service if state is not None else None
@@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamWriter):
     elif configuration.num_sections == 6:
         TOP_K = 30
 
     relevant_documents = []
     async with async_session_maker() as db_session:
         try:
+            # Create connector service inside the db_session scope
+            connector_service = ConnectorService(db_session)
             relevant_documents = await fetch_relevant_documents(
                 research_questions=all_questions,
                 user_id=configuration.user_id,
@@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamWriter):
                 connectors_to_search=configuration.connectors_to_search,
                 writer=writer,
                 state=state,
-                top_k=TOP_K
+                top_k=TOP_K,
+                connector_service=connector_service
             )
         except Exception as e:
             error_message = f"Error fetching relevant documents: {str(e)}"


@@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, Any]:
     # Extract content and metadata
     content = doc.get("content", "")
     doc_info = doc.get("document", {})
-    document_id = doc_info.get("id", f"{i+1}")  # Use document ID or index+1 as source_id
+    document_id = doc_info.get("id")  # Use document ID
 
     # Format document according to the citation system prompt's expected format
     formatted_doc = f"""
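Dropping the f"{i+1}" fallback means a citation only ever carries an ID that ConnectorService actually assigned; a positional fallback could collide with a real source ID and attribute the citation to the wrong document. A small illustration with hypothetical data:

    docs = [
        {"content": "alpha", "document": {"id": 7}},
        {"content": "beta", "document": {}},  # no ID assigned upstream
    ]

    for i, doc in enumerate(docs):
        doc_info = doc.get("document", {})
        old_id = doc_info.get("id", f"{i+1}")  # falls back to "2", which may collide with a real source ID
        new_id = doc_info.get("id")            # None: a missing ID stays visible instead of masquerading
        print(old_id, new_id)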


@@ -1,5 +1,6 @@
 import json
 from typing import List, Dict, Any, Optional, Tuple
+import asyncio
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
@@ -13,6 +14,7 @@ class ConnectorService:
         self.session = session
         self.retriever = ChucksHybridSearchRetriever(session)
         self.source_id_counter = 1
+        self.counter_lock = asyncio.Lock()  # Lock to protect the counter when searches run concurrently
 
     async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple:
         """
@@ -29,8 +31,18 @@ class ConnectorService:
             document_type="CRAWLED_URL"
         )
 
+        # Early return if no results
+        if not crawled_urls_chunks:
+            return {
+                "id": 1,
+                "name": "Crawled URLs",
+                "type": "CRAWLED_URL",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(crawled_urls_chunks):
                 # Fix for UI
                 crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
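The same early-return block recurs in every search method below (Files, Tavily, Slack, Notion, Extension, YouTube, GitHub, Linear), varying only in the id, name, and type fields. The commit keeps the copies inline; if one wanted to factor the shape out, a helper might look roughly like this (hypothetical, not part of the diff):

    from typing import Any, Dict, List, Tuple

    def empty_connector_result(connector_id: int, name: str, type_: str) -> Tuple[Dict[str, Any], List]:
        # Matches the (source_object, documents) tuple shape the search methods return.
        return {"id": connector_id, "name": name, "type": type_, "sources": []}, []

    # Usage inside search_crawled_urls:
    #     if not crawled_urls_chunks:
    #         return empty_connector_result(1, "Crawled URLs", "CRAWLED_URL")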
@@ -74,8 +86,18 @@ class ConnectorService:
             document_type="FILE"
         )
 
+        # Early return if no results
+        if not files_chunks:
+            return {
+                "id": 2,
+                "name": "Files",
+                "type": "FILE",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(files_chunks):
                 # Fix for UI
                 files_chunks[i]['document']['id'] = self.source_id_counter
@@ -163,10 +185,20 @@ class ConnectorService:
         # Extract results from Tavily response
         tavily_results = response.get("results", [])
 
+        # Early return if no results
+        if not tavily_results:
+            return {
+                "id": 3,
+                "name": "Tavily Search",
+                "type": "TAVILY_API",
+                "sources": [],
+            }, []
+
         # Process each result and create sources directly without deduplication
         sources_list = []
         documents = []
+        async with self.counter_lock:
             for i, result in enumerate(tavily_results):
                 # Create a source entry
@@ -232,8 +264,18 @@ class ConnectorService:
             document_type="SLACK_CONNECTOR"
         )
 
+        # Early return if no results
+        if not slack_chunks:
+            return {
+                "id": 4,
+                "name": "Slack",
+                "type": "SLACK_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(slack_chunks):
                 # Fix for UI
                 slack_chunks[i]['document']['id'] = self.source_id_counter
@@ -302,8 +344,18 @@ class ConnectorService:
             document_type="NOTION_CONNECTOR"
         )
 
+        # Early return if no results
+        if not notion_chunks:
+            return {
+                "id": 5,
+                "name": "Notion",
+                "type": "NOTION_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(notion_chunks):
                 # Fix for UI
                 notion_chunks[i]['document']['id'] = self.source_id_counter
@@ -374,8 +426,18 @@ class ConnectorService:
             document_type="EXTENSION"
         )
 
+        # Early return if no results
+        if not extension_chunks:
+            return {
+                "id": 6,
+                "name": "Extension",
+                "type": "EXTENSION",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(extension_chunks):
                 # Fix for UI
                 extension_chunks[i]['document']['id'] = self.source_id_counter
@@ -464,8 +526,18 @@ class ConnectorService:
             document_type="YOUTUBE_VIDEO"
         )
 
+        # Early return if no results
+        if not youtube_chunks:
+            return {
+                "id": 7,
+                "name": "YouTube Videos",
+                "type": "YOUTUBE_VIDEO",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(youtube_chunks):
                 # Fix for UI
                 youtube_chunks[i]['document']['id'] = self.source_id_counter
@@ -530,8 +602,18 @@ class ConnectorService:
             document_type="GITHUB_CONNECTOR"
         )
 
+        # Early return if no results
+        if not github_chunks:
+            return {
+                "id": 8,
+                "name": "GitHub",
+                "type": "GITHUB_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(github_chunks):
                 # Fix for UI - assign a unique ID for citation/source tracking
                 github_chunks[i]['document']['id'] = self.source_id_counter
@@ -582,8 +664,18 @@ class ConnectorService:
             document_type="LINEAR_CONNECTOR"
         )
 
+        # Early return if no results
+        if not linear_chunks:
+            return {
+                "id": 9,
+                "name": "Linear Issues",
+                "type": "LINEAR_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
+        async with self.counter_lock:
             for i, chunk in enumerate(linear_chunks):
                 # Fix for UI
                 linear_chunks[i]['document']['id'] = self.source_id_counter
@@ -697,6 +789,7 @@ class ConnectorService:
         sources_list = []
         documents = []
+        async with self.counter_lock:
             for i, result in enumerate(linkup_results):
                 # Only process results that have content
                 if not hasattr(result, 'content') or not result.content:
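Unlike the chunk-based connectors, the Linkup loop iterates over external result objects that may lack a content attribute, so it filters those out before consuming counter values. A runnable sketch of that guard; the result objects and helper below are stand-ins:

    import asyncio
    from types import SimpleNamespace

    async def collect_sources(results, counter_lock, next_id=1):
        sources = []
        async with counter_lock:
            for result in results:
                # Only process results that have content, mirroring the diff's guard.
                if not hasattr(result, 'content') or not result.content:
                    continue
                sources.append({'id': next_id, 'content': result.content})
                next_id += 1
        return sources

    async def main():
        results = [SimpleNamespace(content="kept"), SimpleNamespace(), SimpleNamespace(content="")]
        print(await collect_sources(results, asyncio.Lock()))  # only the first result survives

    asyncio.run(main())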