Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-06 20:49:09 +00:00)
commit a971bb1f72 (parent 1309f9e262)
chore: update README and refactor ConnectorService for improved document handling and error management

4 changed files with 399 additions and 303 deletions
@@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include
 - fastapi and related packages
 - fastapi-users: Authentication and user management
 - firecrawl-py: Web crawling capabilities
-- gpt-researcher: Advanced research capabilities
 - langchain components for AI workflows
 - litellm: LLM model integration
 - pgvector: Vector similarity search in PostgreSQL
@@ -143,7 +143,8 @@ async def fetch_relevant_documents(
     connectors_to_search: List[str],
     writer: StreamWriter = None,
     state: State = None,
-    top_k: int = 10
+    top_k: int = 10,
+    connector_service: ConnectorService = None
 ) -> List[Dict[str, Any]]:
     """
     Fetch relevant documents for research questions using the provided connectors.
@@ -162,7 +163,7 @@ async def fetch_relevant_documents(
         List of relevant documents
     """
     # Initialize services
-    connector_service = ConnectorService(db_session)
+    # connector_service = ConnectorService(db_session)
 
     # Only use streaming if both writer and state are provided
     streaming_service = state.streaming_service if state is not None else None
@@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
     elif configuration.num_sections == 6:
         TOP_K = 30
 
 
     relevant_documents = []
     async with async_session_maker() as db_session:
         try:
+            # Create connector service inside the db_session scope
+            connector_service = ConnectorService(db_session)
             relevant_documents = await fetch_relevant_documents(
                 research_questions=all_questions,
                 user_id=configuration.user_id,
@@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
                 connectors_to_search=configuration.connectors_to_search,
                 writer=writer,
                 state=state,
-                top_k=TOP_K
+                top_k=TOP_K,
+                connector_service=connector_service
             )
         except Exception as e:
             error_message = f"Error fetching relevant documents: {str(e)}"
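Taken together with the earlier hunks, this change moves ConnectorService construction out of fetch_relevant_documents and into process_sections, inside the db_session scope, with the instance injected as an argument. Below is a minimal, self-contained sketch of that injection pattern; the stub service class and the local fallback branch are illustrative assumptions, not part of the commit.

import asyncio
from typing import Any, Dict, List, Optional

class ConnectorService:  # stand-in for the real service; illustrative only
    def __init__(self, session: Any) -> None:
        self.session = session
        self.source_id_counter = 1  # shared counter the later hunks protect

async def fetch_relevant_documents(
    research_questions: List[str],
    db_session: Any,
    connectors_to_search: List[str],
    top_k: int = 10,
    connector_service: Optional[ConnectorService] = None,
) -> List[Dict[str, Any]]:
    # Reuse the injected service so every connector search shares one ID counter;
    # the fallback below is an assumption, not something the commit adds.
    service = connector_service or ConnectorService(db_session)
    return []  # retrieval elided in this sketch

async def main() -> None:
    db_session = object()  # placeholder for an AsyncSession
    # Mirror of the process_sections change: build the service inside the
    # session scope and pass it down to the fetch call.
    service = ConnectorService(db_session)
    docs = await fetch_relevant_documents(
        research_questions=["example question"],
        db_session=db_session,
        connectors_to_search=["CRAWLED_URL"],
        top_k=10,
        connector_service=service,
    )
    print(len(docs))

asyncio.run(main())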
@@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A
         # Extract content and metadata
         content = doc.get("content", "")
         doc_info = doc.get("document", {})
-        document_id = doc_info.get("id", f"{i+1}")  # Use document ID or index+1 as source_id
+        document_id = doc_info.get("id")  # Use document ID
 
         # Format document according to the citation system prompt's expected format
         formatted_doc = f"""
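The dropped index fallback relies on the ConnectorService changes below: every chunk's document gets an id stamped from source_id_counter (the "Fix for UI" lines), so the writer can read it back directly. A tiny sketch of that assumption, with illustrative dictionary shapes rather than real retriever output:

# Illustrative shapes only; real chunks come from the hybrid search retriever.
doc = {"content": "some chunk text", "document": {}}
doc["document"]["id"] = 7                 # stamped by the connector service ("Fix for UI")
document_id = doc["document"].get("id")   # no index-based fallback needed any more
assert document_id == 7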
@@ -1,5 +1,6 @@
 import json
 from typing import List, Dict, Any, Optional, Tuple
+import asyncio
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
@@ -13,6 +14,7 @@ class ConnectorService:
         self.session = session
         self.retriever = ChucksHybridSearchRetriever(session)
         self.source_id_counter = 1
+        self.counter_lock = asyncio.Lock()  # Lock to protect counter in multithreaded environments
 
     async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple:
         """
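The repeated pattern in the hunks below wraps each ID-assignment loop in async with self.counter_lock, so that connector searches running concurrently on the same event loop (for example via asyncio.gather) cannot interleave their increments of source_id_counter. A self-contained sketch of that idea, using an illustrative class and chunk shape rather than the repository's real code:

import asyncio
from typing import Any, Dict, List

class CounterDemo:  # illustrative stand-in for ConnectorService
    def __init__(self) -> None:
        self.source_id_counter = 1
        self.counter_lock = asyncio.Lock()  # guards concurrent coroutines on one event loop

    async def assign_ids(self, chunks: List[Dict[str, Any]]) -> None:
        # Hold the lock for the whole loop so one search's IDs stay contiguous
        # and two searches never hand out overlapping IDs.
        async with self.counter_lock:
            for chunk in chunks:
                chunk["document"]["id"] = self.source_id_counter
                self.source_id_counter += 1
                await asyncio.sleep(0)  # yield point where another search could otherwise run

async def main() -> None:
    svc = CounterDemo()
    batches = [[{"document": {}} for _ in range(3)] for _ in range(2)]
    await asyncio.gather(*(svc.assign_ids(batch) for batch in batches))
    print([chunk["document"]["id"] for batch in batches for chunk in batch])

asyncio.run(main())

Note that an asyncio.Lock serialises coroutines on one event loop rather than OS threads; since the connector searches are awaited tasks, that is the kind of concurrency the counter actually faces.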
@@ -29,8 +31,18 @@ class ConnectorService:
             document_type="CRAWLED_URL"
         )
 
+        # Early return if no results
+        if not crawled_urls_chunks:
+            return {
+                "id": 1,
+                "name": "Crawled URLs",
+                "type": "CRAWLED_URL",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(crawled_urls_chunks):
-            # Fix for UI
-            crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(crawled_urls_chunks):
+                # Fix for UI
+                crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
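The same early-return shape repeats for every connector below (Files, Tavily, Slack, Notion, Extension, YouTube, GitHub, Linear): when the retriever finds nothing, the method still returns the (connector_info, documents) pair its callers expect, just with empty sources and no documents. A short sketch of a caller relying on that stable shape; the merge helper and call sites are illustrative assumptions, not code from the repository:

from typing import Any, Dict, List, Tuple

def merge_connector_results(
    results: List[Tuple[Dict[str, Any], List[Dict[str, Any]]]],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    # Safe to iterate blindly: every connector returns the same two-tuple,
    # even when it found nothing.
    all_sources: List[Dict[str, Any]] = []
    all_documents: List[Dict[str, Any]] = []
    for connector_info, documents in results:
        all_sources.extend(connector_info.get("sources", []))
        all_documents.extend(documents)
    return all_sources, all_documents

# An empty crawled-URL result merges cleanly with a non-empty files result.
empty = ({"id": 1, "name": "Crawled URLs", "type": "CRAWLED_URL", "sources": []}, [])
full = ({"id": 2, "name": "Files", "type": "FILE", "sources": [{"id": 1}]}, [{"id": 1, "content": "..."}])
print(merge_connector_results([empty, full]))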
@@ -74,8 +86,18 @@ class ConnectorService:
             document_type="FILE"
         )
 
+        # Early return if no results
+        if not files_chunks:
+            return {
+                "id": 2,
+                "name": "Files",
+                "type": "FILE",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(files_chunks):
-            # Fix for UI
-            files_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(files_chunks):
+                # Fix for UI
+                files_chunks[i]['document']['id'] = self.source_id_counter
@@ -163,10 +185,20 @@ class ConnectorService:
         # Extract results from Tavily response
         tavily_results = response.get("results", [])
 
+        # Early return if no results
+        if not tavily_results:
+            return {
+                "id": 3,
+                "name": "Tavily Search",
+                "type": "TAVILY_API",
+                "sources": [],
+            }, []
+
         # Process each result and create sources directly without deduplication
         sources_list = []
         documents = []
 
-        for i, result in enumerate(tavily_results):
-
-            # Create a source entry
+        async with self.counter_lock:
+            for i, result in enumerate(tavily_results):
+
+                # Create a source entry
@@ -232,8 +264,18 @@ class ConnectorService:
             document_type="SLACK_CONNECTOR"
         )
 
+        # Early return if no results
+        if not slack_chunks:
+            return {
+                "id": 4,
+                "name": "Slack",
+                "type": "SLACK_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(slack_chunks):
-            # Fix for UI
-            slack_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(slack_chunks):
+                # Fix for UI
+                slack_chunks[i]['document']['id'] = self.source_id_counter
@@ -302,8 +344,18 @@ class ConnectorService:
             document_type="NOTION_CONNECTOR"
         )
 
+        # Early return if no results
+        if not notion_chunks:
+            return {
+                "id": 5,
+                "name": "Notion",
+                "type": "NOTION_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(notion_chunks):
-            # Fix for UI
-            notion_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(notion_chunks):
+                # Fix for UI
+                notion_chunks[i]['document']['id'] = self.source_id_counter
@@ -374,8 +426,18 @@ class ConnectorService:
             document_type="EXTENSION"
         )
 
+        # Early return if no results
+        if not extension_chunks:
+            return {
+                "id": 6,
+                "name": "Extension",
+                "type": "EXTENSION",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(extension_chunks):
-            # Fix for UI
-            extension_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(extension_chunks):
+                # Fix for UI
+                extension_chunks[i]['document']['id'] = self.source_id_counter
@@ -464,8 +526,18 @@ class ConnectorService:
             document_type="YOUTUBE_VIDEO"
         )
 
+        # Early return if no results
+        if not youtube_chunks:
+            return {
+                "id": 7,
+                "name": "YouTube Videos",
+                "type": "YOUTUBE_VIDEO",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(youtube_chunks):
-            # Fix for UI
-            youtube_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(youtube_chunks):
+                # Fix for UI
+                youtube_chunks[i]['document']['id'] = self.source_id_counter
@@ -530,8 +602,18 @@ class ConnectorService:
             document_type="GITHUB_CONNECTOR"
        )
 
+        # Early return if no results
+        if not github_chunks:
+            return {
+                "id": 8,
+                "name": "GitHub",
+                "type": "GITHUB_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(github_chunks):
-            # Fix for UI - assign a unique ID for citation/source tracking
-            github_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(github_chunks):
+                # Fix for UI - assign a unique ID for citation/source tracking
+                github_chunks[i]['document']['id'] = self.source_id_counter
@@ -582,8 +664,18 @@ class ConnectorService:
             document_type="LINEAR_CONNECTOR"
         )
 
+        # Early return if no results
+        if not linear_chunks:
+            return {
+                "id": 9,
+                "name": "Linear Issues",
+                "type": "LINEAR_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(linear_chunks):
-            # Fix for UI
-            linear_chunks[i]['document']['id'] = self.source_id_counter
+        async with self.counter_lock:
+            for i, chunk in enumerate(linear_chunks):
+                # Fix for UI
+                linear_chunks[i]['document']['id'] = self.source_id_counter
@@ -697,6 +789,7 @@ class ConnectorService:
         sources_list = []
         documents = []
 
-        for i, result in enumerate(linkup_results):
-            # Only process results that have content
-            if not hasattr(result, 'content') or not result.content:
+        async with self.counter_lock:
+            for i, result in enumerate(linkup_results):
+                # Only process results that have content
+                if not hasattr(result, 'content') or not result.content: