chore: update README and refactor ConnectorService for improved document handling and error management

Author: DESKTOP-RTLN3BA\$punk
Date:   2025-04-27 20:39:17 -07:00
parent 1309f9e262
commit a971bb1f72
4 changed files with 399 additions and 303 deletions

@@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include
 - fastapi and related packages
 - fastapi-users: Authentication and user management
 - firecrawl-py: Web crawling capabilities
-- gpt-researcher: Advanced research capabilities
 - langchain components for AI workflows
 - litellm: LLM model integration
 - pgvector: Vector similarity search in PostgreSQL

@@ -143,7 +143,8 @@ async def fetch_relevant_documents(
     connectors_to_search: List[str],
     writer: StreamWriter = None,
     state: State = None,
-    top_k: int = 10
+    top_k: int = 10,
+    connector_service: ConnectorService = None
 ) -> List[Dict[str, Any]]:
     """
     Fetch relevant documents for research questions using the provided connectors.
@@ -162,7 +163,7 @@ async def fetch_relevant_documents(
         List of relevant documents
     """
     # Initialize services
-    connector_service = ConnectorService(db_session)
+    # connector_service = ConnectorService(db_session)
 
     # Only use streaming if both writer and state are provided
     streaming_service = state.streaming_service if state is not None else None
@@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
     elif configuration.num_sections == 6:
         TOP_K = 30
 
     relevant_documents = []
     async with async_session_maker() as db_session:
         try:
+            # Create connector service inside the db_session scope
+            connector_service = ConnectorService(db_session)
             relevant_documents = await fetch_relevant_documents(
                 research_questions=all_questions,
                 user_id=configuration.user_id,
@@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
                 connectors_to_search=configuration.connectors_to_search,
                 writer=writer,
                 state=state,
-                top_k=TOP_K
+                top_k=TOP_K,
+                connector_service=connector_service
             )
         except Exception as e:
             error_message = f"Error fetching relevant documents: {str(e)}"
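
Taken together, these hunks move ConnectorService construction out of fetch_relevant_documents and into the caller's database-session scope, so the service lives exactly as long as the session it wraps. A minimal sketch of the new wiring, assuming only the names visible in this diff (the real function's other arguments are omitted):

# Sketch, not the full process_sections node: async_session_maker,
# ConnectorService, and fetch_relevant_documents are the names from the
# diff above; TOP_K stands in for the configured section-based value.
async def gather_documents(all_questions, configuration, writer, state, TOP_K=30):
    async with async_session_maker() as db_session:
        try:
            # Created inside the session scope and passed down, so every
            # connector search shares one session and one source-ID counter.
            connector_service = ConnectorService(db_session)
            return await fetch_relevant_documents(
                research_questions=all_questions,
                user_id=configuration.user_id,
                connectors_to_search=configuration.connectors_to_search,
                writer=writer,
                state=state,
                top_k=TOP_K,
                connector_service=connector_service,
            )
        except Exception as e:
            # The real node reports this through the stream writer.
            error_message = f"Error fetching relevant documents: {str(e)}"
            return []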

@@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A
         # Extract content and metadata
         content = doc.get("content", "")
         doc_info = doc.get("document", {})
-        document_id = doc_info.get("id", f"{i+1}")  # Use document ID or index+1 as source_id
+        document_id = doc_info.get("id")  # Use document ID
 
         # Format document according to the citation system prompt's expected format
         formatted_doc = f"""
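
Dropping the index fallback matters because source IDs are now assigned centrally by ConnectorService, so a positional i+1 could collide with a real ID from another connector. One consequence is that doc_info.get("id") can now return None; a hypothetical guard (not part of this commit) would be:

# Hypothetical helper, for illustration only: callers may prefer to skip a
# chunk with no assigned ID rather than cite it under a misleading number.
from typing import Any, Dict, Optional

def extract_document_id(doc: Dict[str, Any]) -> Optional[int]:
    return doc.get("document", {}).get("id")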

@@ -1,5 +1,6 @@
 import json
 from typing import List, Dict, Any, Optional, Tuple
+import asyncio
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
@@ -13,6 +14,7 @@ class ConnectorService:
         self.session = session
         self.retriever = ChucksHybridSearchRetriever(session)
         self.source_id_counter = 1
+        self.counter_lock = asyncio.Lock()  # Protects the counter when searches run concurrently
 
     async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple:
         """
@@ -29,25 +31,35 @@ class ConnectorService:
             document_type="CRAWLED_URL"
         )
 
+        # Early return if no results
+        if not crawled_urls_chunks:
+            return {
+                "id": 1,
+                "name": "Crawled URLs",
+                "type": "CRAWLED_URL",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(crawled_urls_chunks):
-            # Fix for UI
-            crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Create a source entry
-            source = {
-                "id": self.source_id_counter,
-                "title": document.get('title', 'Untitled Document'),
-                "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
-                "url": metadata.get('url', '')
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(crawled_urls_chunks):
+                # Fix for UI
+                crawled_urls_chunks[i]['document']['id'] = self.source_id_counter
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Create a source entry
+                source = {
+                    "id": self.source_id_counter,
+                    "title": document.get('title', 'Untitled Document'),
+                    "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
+                    "url": metadata.get('url', '')
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -73,26 +85,36 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="FILE"
         )
 
+        # Early return if no results
+        if not files_chunks:
+            return {
+                "id": 2,
+                "name": "Files",
+                "type": "FILE",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(files_chunks):
-            # Fix for UI
-            files_chunks[i]['document']['id'] = self.source_id_counter
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Create a source entry
-            source = {
-                "id": self.source_id_counter,
-                "title": document.get('title', 'Untitled Document'),
-                "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
-                "url": metadata.get('url', '')
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(files_chunks):
+                # Fix for UI
+                files_chunks[i]['document']['id'] = self.source_id_counter
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Create a source entry
+                source = {
+                    "id": self.source_id_counter,
+                    "title": document.get('title', 'Untitled Document'),
+                    "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])),
+                    "url": metadata.get('url', '')
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -163,39 +185,49 @@ class ConnectorService:
         # Extract results from Tavily response
         tavily_results = response.get("results", [])
 
+        # Early return if no results
+        if not tavily_results:
+            return {
+                "id": 3,
+                "name": "Tavily Search",
+                "type": "TAVILY_API",
+                "sources": [],
+            }, []
+
         # Process each result and create sources directly without deduplication
         sources_list = []
         documents = []
-        for i, result in enumerate(tavily_results):
-            # Create a source entry
-            source = {
-                "id": self.source_id_counter,
-                "title": result.get("title", "Tavily Result"),
-                "description": result.get("content", "")[:100],
-                "url": result.get("url", "")
-            }
-            sources_list.append(source)
-
-            # Create a document entry
-            document = {
-                "chunk_id": f"tavily_chunk_{i}",
-                "content": result.get("content", ""),
-                "score": result.get("score", 0.0),
-                "document": {
-                    "id": self.source_id_counter,
-                    "title": result.get("title", "Tavily Result"),
-                    "document_type": "TAVILY_API",
-                    "metadata": {
-                        "url": result.get("url", ""),
-                        "published_date": result.get("published_date", ""),
-                        "source": "TAVILY_API"
-                    }
-                }
-            }
-            documents.append(document)
-            self.source_id_counter += 1
+        async with self.counter_lock:
+            for i, result in enumerate(tavily_results):
+                # Create a source entry
+                source = {
+                    "id": self.source_id_counter,
+                    "title": result.get("title", "Tavily Result"),
+                    "description": result.get("content", "")[:100],
+                    "url": result.get("url", "")
+                }
+                sources_list.append(source)
+
+                # Create a document entry
+                document = {
+                    "chunk_id": f"tavily_chunk_{i}",
+                    "content": result.get("content", ""),
+                    "score": result.get("score", 0.0),
+                    "document": {
+                        "id": self.source_id_counter,
+                        "title": result.get("title", "Tavily Result"),
+                        "document_type": "TAVILY_API",
+                        "metadata": {
+                            "url": result.get("url", ""),
+                            "published_date": result.get("published_date", ""),
+                            "source": "TAVILY_API"
+                        }
+                    }
+                }
+                documents.append(document)
+                self.source_id_counter += 1
 
         # Create result object
         result_object = {
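
Every search method returns this same two-part shape: a connector summary object for the UI plus the list of chunk documents, with matching "id" values linking a document back to its source entry. Reconstructed from the diff, with illustrative values only:

# Field layout follows the diff above; the concrete values are made up.
result_object = {
    "id": 3,
    "name": "Tavily Search",
    "type": "TAVILY_API",
    "sources": [
        {
            "id": 17,  # ConnectorService-assigned source ID
            "title": "Example result",
            "description": "First 100 characters of the result content",
            "url": "https://example.com/page",
        },
    ],
}
documents = [
    {
        "chunk_id": "tavily_chunk_0",
        "content": "Full result content",
        "score": 0.87,
        "document": {
            "id": 17,  # same ID ties the chunk back to its source entry
            "title": "Example result",
            "document_type": "TAVILY_API",
            "metadata": {"url": "https://example.com/page", "source": "TAVILY_API"},
        },
    },
]
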
@@ -231,45 +263,55 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="SLACK_CONNECTOR"
         )
 
+        # Early return if no results
+        if not slack_chunks:
+            return {
+                "id": 4,
+                "name": "Slack",
+                "type": "SLACK_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(slack_chunks):
-            # Fix for UI
-            slack_chunks[i]['document']['id'] = self.source_id_counter
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Create a mapped source entry with Slack-specific metadata
-            channel_name = metadata.get('channel_name', 'Unknown Channel')
-            channel_id = metadata.get('channel_id', '')
-            message_date = metadata.get('start_date', '')
-
-            # Create a more descriptive title for Slack messages
-            title = f"Slack: {channel_name}"
-            if message_date:
-                title += f" ({message_date})"
-
-            # Create a more descriptive description for Slack messages
-            description = chunk.get('content', '')[:100]
-            if len(description) == 100:
-                description += "..."
-
-            # For URL, we can use a placeholder or construct a URL to the Slack channel if available
-            url = ""
-            if channel_id:
-                url = f"https://slack.com/app_redirect?channel={channel_id}"
-
-            source = {
-                "id": self.source_id_counter,
-                "title": title,
-                "description": description,
-                "url": url,
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(slack_chunks):
+                # Fix for UI
+                slack_chunks[i]['document']['id'] = self.source_id_counter
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Create a mapped source entry with Slack-specific metadata
+                channel_name = metadata.get('channel_name', 'Unknown Channel')
+                channel_id = metadata.get('channel_id', '')
+                message_date = metadata.get('start_date', '')
+
+                # Create a more descriptive title for Slack messages
+                title = f"Slack: {channel_name}"
+                if message_date:
+                    title += f" ({message_date})"
+
+                # Create a more descriptive description for Slack messages
+                description = chunk.get('content', '')[:100]
+                if len(description) == 100:
+                    description += "..."
+
+                # For URL, we can use a placeholder or construct a URL to the Slack channel if available
+                url = ""
+                if channel_id:
+                    url = f"https://slack.com/app_redirect?channel={channel_id}"
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -301,47 +343,57 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="NOTION_CONNECTOR"
         )
 
+        # Early return if no results
+        if not notion_chunks:
+            return {
+                "id": 5,
+                "name": "Notion",
+                "type": "NOTION_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(notion_chunks):
-            # Fix for UI
-            notion_chunks[i]['document']['id'] = self.source_id_counter
-
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Create a mapped source entry with Notion-specific metadata
-            page_title = metadata.get('page_title', 'Untitled Page')
-            page_id = metadata.get('page_id', '')
-            indexed_at = metadata.get('indexed_at', '')
-
-            # Create a more descriptive title for Notion pages
-            title = f"Notion: {page_title}"
-            if indexed_at:
-                title += f" (indexed: {indexed_at})"
-
-            # Create a more descriptive description for Notion pages
-            description = chunk.get('content', '')[:100]
-            if len(description) == 100:
-                description += "..."
-
-            # For URL, we can use a placeholder or construct a URL to the Notion page if available
-            url = ""
-            if page_id:
-                # Notion page URLs follow this format
-                url = f"https://notion.so/{page_id.replace('-', '')}"
-
-            source = {
-                "id": self.source_id_counter,
-                "title": title,
-                "description": description,
-                "url": url,
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(notion_chunks):
+                # Fix for UI
+                notion_chunks[i]['document']['id'] = self.source_id_counter
+
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Create a mapped source entry with Notion-specific metadata
+                page_title = metadata.get('page_title', 'Untitled Page')
+                page_id = metadata.get('page_id', '')
+                indexed_at = metadata.get('indexed_at', '')
+
+                # Create a more descriptive title for Notion pages
+                title = f"Notion: {page_title}"
+                if indexed_at:
+                    title += f" (indexed: {indexed_at})"
+
+                # Create a more descriptive description for Notion pages
+                description = chunk.get('content', '')[:100]
+                if len(description) == 100:
+                    description += "..."
+
+                # For URL, we can use a placeholder or construct a URL to the Notion page if available
+                url = ""
+                if page_id:
+                    # Notion page URLs follow this format
+                    url = f"https://notion.so/{page_id.replace('-', '')}"
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -373,65 +425,75 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="EXTENSION"
         )
 
+        # Early return if no results
+        if not extension_chunks:
+            return {
+                "id": 6,
+                "name": "Extension",
+                "type": "EXTENSION",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(extension_chunks):
-            # Fix for UI
-            extension_chunks[i]['document']['id'] = self.source_id_counter
-
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Extract extension-specific metadata
-            webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
-            webpage_url = metadata.get('VisitedWebPageURL', '')
-            visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
-            visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')
-            browsing_session_id = metadata.get('BrowsingSessionId', '')
-
-            # Create a more descriptive title for extension data
-            title = webpage_title
-            if visit_date:
-                # Format the date for display (simplified)
-                try:
-                    # Just extract the date part for display
-                    formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
-                    title += f" (visited: {formatted_date})"
-                except:
-                    # Fallback if date parsing fails
-                    title += f" (visited: {visit_date})"
-
-            # Create a more descriptive description for extension data
-            description = chunk.get('content', '')[:100]
-            if len(description) == 100:
-                description += "..."
-
-            # Add visit duration if available
-            if visit_duration:
-                try:
-                    duration_seconds = int(visit_duration) / 1000
-                    if duration_seconds < 60:
-                        duration_text = f"{duration_seconds:.1f} seconds"
-                    else:
-                        duration_text = f"{duration_seconds/60:.1f} minutes"
-
-                    if description:
-                        description += f" | Duration: {duration_text}"
-                except:
-                    # Fallback if duration parsing fails
-                    pass
-
-            source = {
-                "id": self.source_id_counter,
-                "title": title,
-                "description": description,
-                "url": webpage_url
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(extension_chunks):
+                # Fix for UI
+                extension_chunks[i]['document']['id'] = self.source_id_counter
+
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Extract extension-specific metadata
+                webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
+                webpage_url = metadata.get('VisitedWebPageURL', '')
+                visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
+                visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')
+                browsing_session_id = metadata.get('BrowsingSessionId', '')
+
+                # Create a more descriptive title for extension data
+                title = webpage_title
+                if visit_date:
+                    # Format the date for display (simplified)
+                    try:
+                        # Just extract the date part for display
+                        formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
+                        title += f" (visited: {formatted_date})"
+                    except:
+                        # Fallback if date parsing fails
+                        title += f" (visited: {visit_date})"
+
+                # Create a more descriptive description for extension data
+                description = chunk.get('content', '')[:100]
+                if len(description) == 100:
+                    description += "..."
+
+                # Add visit duration if available
+                if visit_duration:
+                    try:
+                        duration_seconds = int(visit_duration) / 1000
+                        if duration_seconds < 60:
+                            duration_text = f"{duration_seconds:.1f} seconds"
+                        else:
+                            duration_text = f"{duration_seconds/60:.1f} minutes"
+
+                        if description:
+                            description += f" | Duration: {duration_text}"
+                    except:
+                        # Fallback if duration parsing fails
+                        pass
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": webpage_url
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -463,47 +525,57 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="YOUTUBE_VIDEO"
         )
 
+        # Early return if no results
+        if not youtube_chunks:
+            return {
+                "id": 7,
+                "name": "YouTube Videos",
+                "type": "YOUTUBE_VIDEO",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(youtube_chunks):
-            # Fix for UI
-            youtube_chunks[i]['document']['id'] = self.source_id_counter
-
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Extract YouTube-specific metadata
-            video_title = metadata.get('video_title', 'Untitled Video')
-            video_id = metadata.get('video_id', '')
-            channel_name = metadata.get('channel_name', '')
-            published_date = metadata.get('published_date', '')
-
-            # Create a more descriptive title for YouTube videos
-            title = video_title
-            if channel_name:
-                title += f" - {channel_name}"
-
-            # Create a more descriptive description for YouTube videos
-            description = metadata.get('description', chunk.get('content', '')[:100])
-            if len(description) == 100:
-                description += "..."
-
-            # For URL, construct a URL to the YouTube video
-            url = f"https://www.youtube.com/watch?v={video_id}" if video_id else ""
-
-            source = {
-                "id": self.source_id_counter,
-                "title": title,
-                "description": description,
-                "url": url,
-                "video_id": video_id,  # Additional field for YouTube videos
-                "channel_name": channel_name  # Additional field for YouTube videos
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(youtube_chunks):
+                # Fix for UI
+                youtube_chunks[i]['document']['id'] = self.source_id_counter
+
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Extract YouTube-specific metadata
+                video_title = metadata.get('video_title', 'Untitled Video')
+                video_id = metadata.get('video_id', '')
+                channel_name = metadata.get('channel_name', '')
+                published_date = metadata.get('published_date', '')
+
+                # Create a more descriptive title for YouTube videos
+                title = video_title
+                if channel_name:
+                    title += f" - {channel_name}"
+
+                # Create a more descriptive description for YouTube videos
+                description = metadata.get('description', chunk.get('content', '')[:100])
+                if len(description) == 100:
+                    description += "..."
+
+                # For URL, construct a URL to the YouTube video
+                url = f"https://www.youtube.com/watch?v={video_id}" if video_id else ""
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                    "video_id": video_id,  # Additional field for YouTube videos
+                    "channel_name": channel_name  # Additional field for YouTube videos
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -529,27 +601,37 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="GITHUB_CONNECTOR"
         )
 
+        # Early return if no results
+        if not github_chunks:
+            return {
+                "id": 8,
+                "name": "GitHub",
+                "type": "GITHUB_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(github_chunks):
-            # Fix for UI - assign a unique ID for citation/source tracking
-            github_chunks[i]['document']['id'] = self.source_id_counter
-
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Create a source entry
-            source = {
-                "id": self.source_id_counter,
-                "title": document.get('title', 'GitHub Document'),  # Use specific title if available
-                "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
-                "url": metadata.get('url', '')  # Use URL if available in metadata
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(github_chunks):
+                # Fix for UI - assign a unique ID for citation/source tracking
+                github_chunks[i]['document']['id'] = self.source_id_counter
+
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Create a source entry
+                source = {
+                    "id": self.source_id_counter,
+                    "title": document.get('title', 'GitHub Document'),  # Use specific title if available
+                    "description": metadata.get('description', chunk.get('content', '')[:100]),  # Use description or content preview
+                    "url": metadata.get('url', '')  # Use URL if available in metadata
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -581,59 +663,69 @@ class ConnectorService:
             search_space_id=search_space_id,
             document_type="LINEAR_CONNECTOR"
         )
 
+        # Early return if no results
+        if not linear_chunks:
+            return {
+                "id": 9,
+                "name": "Linear Issues",
+                "type": "LINEAR_CONNECTOR",
+                "sources": [],
+            }, []
+
         # Process each chunk and create sources directly without deduplication
         sources_list = []
-        for i, chunk in enumerate(linear_chunks):
-            # Fix for UI
-            linear_chunks[i]['document']['id'] = self.source_id_counter
-
-            # Extract document metadata
-            document = chunk.get('document', {})
-            metadata = document.get('metadata', {})
-
-            # Extract Linear-specific metadata
-            issue_identifier = metadata.get('issue_identifier', '')
-            issue_title = metadata.get('issue_title', 'Untitled Issue')
-            issue_state = metadata.get('state', '')
-            comment_count = metadata.get('comment_count', 0)
-
-            # Create a more descriptive title for Linear issues
-            title = f"Linear: {issue_identifier} - {issue_title}"
-            if issue_state:
-                title += f" ({issue_state})"
-
-            # Create a more descriptive description for Linear issues
-            description = chunk.get('content', '')[:100]
-            if len(description) == 100:
-                description += "..."
-
-            # Add comment count info to description
-            if comment_count:
-                if description:
-                    description += f" | Comments: {comment_count}"
-                else:
-                    description = f"Comments: {comment_count}"
-
-            # For URL, we could construct a URL to the Linear issue if we have the workspace info
-            # For now, use a generic placeholder
-            url = ""
-            if issue_identifier:
-                # This is a generic format, may need to be adjusted based on actual Linear workspace
-                url = f"https://linear.app/issue/{issue_identifier}"
-
-            source = {
-                "id": self.source_id_counter,
-                "title": title,
-                "description": description,
-                "url": url,
-                "issue_identifier": issue_identifier,
-                "state": issue_state,
-                "comment_count": comment_count
-            }
-            self.source_id_counter += 1
-            sources_list.append(source)
+        async with self.counter_lock:
+            for i, chunk in enumerate(linear_chunks):
+                # Fix for UI
+                linear_chunks[i]['document']['id'] = self.source_id_counter
+
+                # Extract document metadata
+                document = chunk.get('document', {})
+                metadata = document.get('metadata', {})
+
+                # Extract Linear-specific metadata
+                issue_identifier = metadata.get('issue_identifier', '')
+                issue_title = metadata.get('issue_title', 'Untitled Issue')
+                issue_state = metadata.get('state', '')
+                comment_count = metadata.get('comment_count', 0)
+
+                # Create a more descriptive title for Linear issues
+                title = f"Linear: {issue_identifier} - {issue_title}"
+                if issue_state:
+                    title += f" ({issue_state})"
+
+                # Create a more descriptive description for Linear issues
+                description = chunk.get('content', '')[:100]
+                if len(description) == 100:
+                    description += "..."
+
+                # Add comment count info to description
+                if comment_count:
+                    if description:
+                        description += f" | Comments: {comment_count}"
+                    else:
+                        description = f"Comments: {comment_count}"
+
+                # For URL, we could construct a URL to the Linear issue if we have the workspace info
+                # For now, use a generic placeholder
+                url = ""
+                if issue_identifier:
+                    # This is a generic format, may need to be adjusted based on actual Linear workspace
+                    url = f"https://linear.app/issue/{issue_identifier}"
+
+                source = {
+                    "id": self.source_id_counter,
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                    "issue_identifier": issue_identifier,
+                    "state": issue_state,
+                    "comment_count": comment_count
+                }
+                self.source_id_counter += 1
+                sources_list.append(source)
 
         # Create result object
         result_object = {
@@ -697,38 +789,39 @@ class ConnectorService:
         sources_list = []
         documents = []
 
-        for i, result in enumerate(linkup_results):
-            # Only process results that have content
-            if not hasattr(result, 'content') or not result.content:
-                continue
-
-            # Create a source entry
-            source = {
-                "id": self.source_id_counter,
-                "title": result.name if hasattr(result, 'name') else "Linkup Result",
-                "description": result.content[:100] if hasattr(result, 'content') else "",
-                "url": result.url if hasattr(result, 'url') else ""
-            }
-            sources_list.append(source)
-
-            # Create a document entry
-            document = {
-                "chunk_id": f"linkup_chunk_{i}",
-                "content": result.content if hasattr(result, 'content') else "",
-                "score": 1.0,  # Default score since not provided by Linkup
-                "document": {
-                    "id": self.source_id_counter,
-                    "title": result.name if hasattr(result, 'name') else "Linkup Result",
-                    "document_type": "LINKUP_API",
-                    "metadata": {
-                        "url": result.url if hasattr(result, 'url') else "",
-                        "type": result.type if hasattr(result, 'type') else "",
-                        "source": "LINKUP_API"
-                    }
-                }
-            }
-            documents.append(document)
-            self.source_id_counter += 1
+        async with self.counter_lock:
+            for i, result in enumerate(linkup_results):
+                # Only process results that have content
+                if not hasattr(result, 'content') or not result.content:
+                    continue
+
+                # Create a source entry
+                source = {
+                    "id": self.source_id_counter,
+                    "title": result.name if hasattr(result, 'name') else "Linkup Result",
+                    "description": result.content[:100] if hasattr(result, 'content') else "",
+                    "url": result.url if hasattr(result, 'url') else ""
+                }
+                sources_list.append(source)
+
+                # Create a document entry
+                document = {
+                    "chunk_id": f"linkup_chunk_{i}",
+                    "content": result.content if hasattr(result, 'content') else "",
+                    "score": 1.0,  # Default score since not provided by Linkup
+                    "document": {
+                        "id": self.source_id_counter,
+                        "title": result.name if hasattr(result, 'name') else "Linkup Result",
+                        "document_type": "LINKUP_API",
+                        "metadata": {
+                            "url": result.url if hasattr(result, 'url') else "",
+                            "type": result.type if hasattr(result, 'type') else "",
+                            "source": "LINKUP_API"
+                        }
+                    }
+                }
+                documents.append(document)
+                self.source_id_counter += 1
 
         # Create result object
         result_object = {
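
With every mutation of source_id_counter behind the lock, a single ConnectorService instance can serve several connector searches at once. A usage sketch; only search_crawled_urls is named in this diff, so the second method name is an assumed counterpart for illustration:

import asyncio

async def search_everything(service, query: str, user_id: str, search_space_id: int):
    # search_files is hypothetical here; each call returns an
    # (result_object, documents) tuple as shown above.
    results = await asyncio.gather(
        service.search_crawled_urls(query, user_id, search_space_id),
        service.search_files(query, user_id, search_space_id),
    )
    # Source IDs stay globally unique across connectors because every
    # counter increment happens under the shared asyncio.Lock.
    return results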