update connector indexing / update connector service

CREDO23 2025-07-24 11:52:21 +02:00
parent ca98693005
commit cd05a06a91
8 changed files with 1544 additions and 732 deletions


@@ -186,6 +186,27 @@ async def fetch_documents_by_ids(
            else:
                url = ""
        elif doc_type == "JIRA_CONNECTOR":
            # Extract Jira-specific metadata
            issue_key = metadata.get('issue_key', 'Unknown Issue')
            issue_title = metadata.get('issue_title', 'Untitled Issue')
            status = metadata.get('status', '')
            priority = metadata.get('priority', '')
            issue_type = metadata.get('issue_type', '')

            title = f"Jira: {issue_key} - {issue_title}"
            if status:
                title += f" ({status})"

            description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content

            # Construct Jira URL if we have the base URL
            base_url = metadata.get('base_url', '')
            if base_url and issue_key:
                url = f"{base_url}/browse/{issue_key}"
            else:
                url = ""
        elif doc_type == "EXTENSION":
            # Extract Extension-specific metadata
            webpage_title = metadata.get('VisitedWebPageTitle', doc.title)
@@ -227,6 +248,7 @@ async def fetch_documents_by_ids(
        "GITHUB_CONNECTOR": "GitHub (Selected)",
        "YOUTUBE_VIDEO": "YouTube Videos (Selected)",
        "DISCORD_CONNECTOR": "Discord (Selected)",
        "JIRA_CONNECTOR": "Jira Issues (Selected)",
        "EXTENSION": "Browser Extension (Selected)",
        "CRAWLED_URL": "Web Pages (Selected)",
        "FILE": "Files (Selected)"
@@ -741,6 +763,30 @@ async def fetch_relevant_documents(
                        }
                    )
            elif connector == "JIRA_CONNECTOR":
                source_object, jira_chunks = await connector_service.search_jira(
                    user_query=reformulated_query,
                    user_id=user_id,
                    search_space_id=search_space_id,
                    top_k=top_k,
                    search_mode=search_mode
                )

                # Add to sources and raw documents
                if source_object:
                    all_sources.append(source_object)
                all_raw_documents.extend(jira_chunks)

                # Stream found document count
                if streaming_service and writer:
                    writer(
                        {
                            "yield_value": streaming_service.format_terminal_info_delta(
                                f"🎫 Found {len(jira_chunks)} Jira issues related to your query"
                            )
                        }
                    )

        except Exception as e:
            error_message = f"Error searching connector {connector}: {str(e)}"
            print(error_message)
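For reference, the Jira branch added above boils down to the following standalone sketch. The helper name and the sample values are hypothetical; only the metadata keys (issue_key, issue_title, status, base_url) and the title/URL rules come from the diff.

def build_jira_source(metadata: dict, content: str) -> dict:
    """Build a citation-style source entry from a Jira document's metadata."""
    issue_key = metadata.get('issue_key', 'Unknown Issue')
    issue_title = metadata.get('issue_title', 'Untitled Issue')
    status = metadata.get('status', '')
    title = f"Jira: {issue_key} - {issue_title}"
    if status:
        title += f" ({status})"
    # Truncate long content for the preview description
    description = content[:100] + "..." if len(content) > 100 else content
    # A browse URL can only be built when the connector stored the base URL
    base_url = metadata.get('base_url', '')
    url = f"{base_url}/browse/{issue_key}" if base_url and issue_key else ""
    return {"title": title, "description": description, "url": url}

# Example (made-up values):
# build_jira_source(
#     {"issue_key": "PROJ-42", "issue_title": "Fix login bug",
#      "status": "In Progress", "base_url": "https://example.atlassian.net"},
#     "Users cannot log in after the last deploy...")
# -> title "Jira: PROJ-42 - Fix login bug (In Progress)",
#    url "https://example.atlassian.net/browse/PROJ-42"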


@@ -15,6 +15,8 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications)
- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions)
- TAVILY_API: "Tavily search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)


@@ -33,6 +33,8 @@ def get_connector_emoji(connector_name: str) -> str:
        "NOTION_CONNECTOR": "📘",
        "GITHUB_CONNECTOR": "🐙",
        "LINEAR_CONNECTOR": "📊",
        "JIRA_CONNECTOR": "🎫",
        "DISCORD_CONNECTOR": "🗨️",
        "TAVILY_API": "🔍",
        "LINKUP_API": "🔗"
    }
@@ -50,6 +52,8 @@ def get_connector_friendly_name(connector_name: str) -> str:
        "NOTION_CONNECTOR": "Notion",
        "GITHUB_CONNECTOR": "GitHub",
        "LINEAR_CONNECTOR": "Linear",
        "JIRA_CONNECTOR": "Jira",
        "DISCORD_CONNECTOR": "Discord",
        "TAVILY_API": "Tavily Search",
        "LINKUP_API": "Linkup Search"
    }
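Both helpers are plain dictionary lookups keyed by the connector enum name, so with these additions the new connector resolves as shown below. The behaviour for unknown names depends on each function's default, which this hunk does not show.

print(get_connector_emoji("JIRA_CONNECTOR"))          # 🎫
print(get_connector_friendly_name("JIRA_CONNECTOR"))  # Jira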


@@ -0,0 +1,218 @@
import unittest
from unittest.mock import patch, Mock
from datetime import datetime

# Import the JiraConnector
from .jira_connector import JiraConnector


class TestJiraConnector(unittest.TestCase):
    def setUp(self):
        """Set up test fixtures."""
        self.base_url = "https://test.atlassian.net"
        self.token = "test_token"
        self.connector = JiraConnector(base_url=self.base_url, personal_access_token=self.token)

    def test_init(self):
        """Test JiraConnector initialization."""
        self.assertEqual(self.connector.base_url, self.base_url)
        self.assertEqual(self.connector.personal_access_token, self.token)
        self.assertEqual(self.connector.api_version, "3")

    def test_init_with_trailing_slash(self):
        """Test JiraConnector initialization with trailing slash in URL."""
        connector = JiraConnector(base_url="https://test.atlassian.net/", personal_access_token=self.token)
        self.assertEqual(connector.base_url, "https://test.atlassian.net")

    def test_set_credentials(self):
        """Test setting credentials."""
        new_url = "https://newtest.atlassian.net/"
        new_token = "new_token"
        self.connector.set_credentials(new_url, new_token)
        self.assertEqual(self.connector.base_url, "https://newtest.atlassian.net")
        self.assertEqual(self.connector.personal_access_token, new_token)

    def test_get_headers(self):
        """Test header generation."""
        headers = self.connector.get_headers()
        self.assertIn('Content-Type', headers)
        self.assertIn('Authorization', headers)
        self.assertIn('Accept', headers)
        self.assertEqual(headers['Content-Type'], 'application/json')
        self.assertEqual(headers['Accept'], 'application/json')
        self.assertTrue(headers['Authorization'].startswith('Bearer '))

    def test_get_headers_no_credentials(self):
        """Test header generation without credentials."""
        connector = JiraConnector()
        with self.assertRaises(ValueError) as context:
            connector.get_headers()
        self.assertIn("Jira credentials not initialized", str(context.exception))

    @patch('requests.get')
    def test_make_api_request_success(self, mock_get):
        """Test successful API request."""
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"test": "data"}
        mock_get.return_value = mock_response
        result = self.connector.make_api_request("test/endpoint")
        self.assertEqual(result, {"test": "data"})
        mock_get.assert_called_once()

    @patch('requests.get')
    def test_make_api_request_failure(self, mock_get):
        """Test failed API request."""
        mock_response = Mock()
        mock_response.status_code = 401
        mock_response.text = "Unauthorized"
        mock_get.return_value = mock_response
        with self.assertRaises(Exception) as context:
            self.connector.make_api_request("test/endpoint")
        self.assertIn("API request failed with status code 401", str(context.exception))

    @patch.object(JiraConnector, 'make_api_request')
    def test_get_all_projects(self, mock_api_request):
        """Test getting all projects."""
        mock_api_request.return_value = {
            "values": [
                {"id": "1", "key": "TEST", "name": "Test Project"},
                {"id": "2", "key": "DEMO", "name": "Demo Project"}
            ]
        }
        projects = self.connector.get_all_projects()
        self.assertEqual(len(projects), 2)
        self.assertEqual(projects[0]["key"], "TEST")
        self.assertEqual(projects[1]["key"], "DEMO")
        mock_api_request.assert_called_once_with("project")

    @patch.object(JiraConnector, 'make_api_request')
    def test_get_all_issues(self, mock_api_request):
        """Test getting all issues."""
        mock_api_request.return_value = {
            "issues": [
                {
                    "id": "1",
                    "key": "TEST-1",
                    "fields": {
                        "summary": "Test Issue",
                        "description": "Test Description",
                        "status": {"name": "Open"},
                        "priority": {"name": "High"},
                        "issuetype": {"name": "Bug"},
                        "project": {"key": "TEST"},
                        "created": "2023-01-01T10:00:00.000+0000",
                        "updated": "2023-01-01T12:00:00.000+0000"
                    }
                }
            ],
            "total": 1
        }
        issues = self.connector.get_all_issues()
        self.assertEqual(len(issues), 1)
        self.assertEqual(issues[0]["key"], "TEST-1")
        self.assertEqual(issues[0]["fields"]["summary"], "Test Issue")

    def test_format_issue(self):
        """Test issue formatting."""
        raw_issue = {
            "id": "1",
            "key": "TEST-1",
            "fields": {
                "summary": "Test Issue",
                "description": "Test Description",
                "status": {"name": "Open", "statusCategory": {"name": "To Do"}},
                "priority": {"name": "High"},
                "issuetype": {"name": "Bug"},
                "project": {"key": "TEST"},
                "created": "2023-01-01T10:00:00.000+0000",
                "updated": "2023-01-01T12:00:00.000+0000",
                "reporter": {
                    "accountId": "123",
                    "displayName": "John Doe",
                    "emailAddress": "john@example.com"
                },
                "assignee": {
                    "accountId": "456",
                    "displayName": "Jane Smith",
                    "emailAddress": "jane@example.com"
                }
            }
        }
        formatted = self.connector.format_issue(raw_issue)
        self.assertEqual(formatted["id"], "1")
        self.assertEqual(formatted["key"], "TEST-1")
        self.assertEqual(formatted["title"], "Test Issue")
        self.assertEqual(formatted["status"], "Open")
        self.assertEqual(formatted["priority"], "High")
        self.assertEqual(formatted["issue_type"], "Bug")
        self.assertEqual(formatted["project"], "TEST")
        self.assertEqual(formatted["reporter"]["display_name"], "John Doe")
        self.assertEqual(formatted["assignee"]["display_name"], "Jane Smith")

    def test_format_date(self):
        """Test date formatting."""
        iso_date = "2023-01-01T10:30:00.000+0000"
        formatted_date = JiraConnector.format_date(iso_date)
        self.assertEqual(formatted_date, "2023-01-01 10:30:00")

    def test_format_date_invalid(self):
        """Test date formatting with invalid input."""
        formatted_date = JiraConnector.format_date("invalid-date")
        self.assertEqual(formatted_date, "invalid-date")
        formatted_date = JiraConnector.format_date("")
        self.assertEqual(formatted_date, "Unknown date")
        formatted_date = JiraConnector.format_date(None)
        self.assertEqual(formatted_date, "Unknown date")

    def test_format_issue_to_markdown(self):
        """Test issue to markdown conversion."""
        formatted_issue = {
            "key": "TEST-1",
            "title": "Test Issue",
            "status": "Open",
            "priority": "High",
            "issue_type": "Bug",
            "project": "TEST",
            "assignee": {"display_name": "Jane Smith"},
            "reporter": {"display_name": "John Doe"},
            "created_at": "2023-01-01T10:00:00.000+0000",
            "updated_at": "2023-01-01T12:00:00.000+0000",
            "description": "Test Description",
            "comments": []
        }
        markdown = self.connector.format_issue_to_markdown(formatted_issue)
        self.assertIn("# TEST-1: Test Issue", markdown)
        self.assertIn("**Status:** Open", markdown)
        self.assertIn("**Priority:** High", markdown)
        self.assertIn("**Type:** Bug", markdown)
        self.assertIn("**Project:** TEST", markdown)
        self.assertIn("**Assignee:** Jane Smith", markdown)
        self.assertIn("**Reporter:** John Doe", markdown)
        self.assertIn("## Description", markdown)
        self.assertIn("Test Description", markdown)


if __name__ == '__main__':
    unittest.main()
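The expectations above pin down JiraConnector.format_date fairly tightly. One minimal implementation that satisfies all three tests, shown here as a module-level function (on the connector it would sit under @staticmethod, since the tests call it as JiraConnector.format_date), is a sketch rather than the actual connector code:

from datetime import datetime

def format_date(date_str):
    """Convert a Jira ISO-8601 timestamp to 'YYYY-MM-DD HH:MM:SS'."""
    if not date_str:
        return "Unknown date"
    try:
        # Jira returns timestamps like "2023-01-01T10:30:00.000+0000"
        parsed = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f%z")
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        # Fall back to the raw string when parsing fails
        return date_str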


@@ -19,7 +19,7 @@ from app.schemas import SearchSourceConnectorCreate, SearchSourceConnectorUpdate
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from pydantic import BaseModel, Field, ValidationError
from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues, index_discord_messages, index_jira_issues
from app.connectors.github_connector import GitHubConnector
from datetime import datetime, timedelta
import logging
@@ -284,6 +284,7 @@ async def index_connector_content(
    - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages
    - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories
    - LINEAR_CONNECTOR: Indexes issues and comments from Linear
    - JIRA_CONNECTOR: Indexes issues and comments from Jira
    - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels

    Args:
@@ -349,6 +350,12 @@ async def index_connector_content(
        background_tasks.add_task(run_linear_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to)
        response_message = "Linear indexing started in the background."
    elif connector.connector_type == SearchSourceConnectorType.JIRA_CONNECTOR:
        # Run indexing in background
        logger.info(f"Triggering Jira indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}")
        background_tasks.add_task(run_jira_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to)
        response_message = "Jira indexing started in the background."
    elif connector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR:
        # Run indexing in background
        logger.info(
@@ -648,3 +655,44 @@ async def run_discord_indexing(
            logger.error(f"Discord indexing failed or no documents processed: {error_or_warning}")
    except Exception as e:
        logger.error(f"Error in background Discord indexing task: {str(e)}")

# Add new helper functions for Jira indexing
async def run_jira_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str
):
    """Wrapper to run Jira indexing with its own database session."""
    logger.info(f"Background task started: Indexing Jira connector {connector_id} into space {search_space_id} from {start_date} to {end_date}")
    async with async_session_maker() as session:
        await run_jira_indexing(session, connector_id, search_space_id, user_id, start_date, end_date)
    logger.info(f"Background task finished: Indexing Jira connector {connector_id}")


async def run_jira_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str
):
    """Runs the Jira indexing task and updates the timestamp."""
    try:
        indexed_count, error_message = await index_jira_issues(
            session, connector_id, search_space_id, user_id, start_date, end_date, update_last_indexed=False
        )
        if error_message:
            logger.error(f"Jira indexing failed for connector {connector_id}: {error_message}")
            # Optionally update status in DB to indicate failure
        else:
            logger.info(f"Jira indexing successful for connector {connector_id}. Indexed {indexed_count} documents.")
            # Update the last indexed timestamp only on success
            await update_connector_last_indexed(session, connector_id)
            await session.commit()  # Commit timestamp update
    except Exception as e:
        await session.rollback()
        logger.error(f"Critical error in run_jira_indexing for connector {connector_id}: {e}", exc_info=True)
        # Optionally update status in DB to indicate failure
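run_jira_indexing relies on an update_connector_last_indexed helper that already exists for the other connectors and is not shown in this diff. Assuming it simply stamps the connector row (the column name last_indexed_at is an assumption), a rough sketch of what it would look like is:

from datetime import datetime, timezone
from sqlalchemy.ext.asyncio import AsyncSession

async def update_connector_last_indexed(session: AsyncSession, connector_id: int) -> None:
    """Best-guess sketch: mark the connector as freshly indexed; the caller commits."""
    # SearchSourceConnector is the ORM model already imported in this module via app.db
    connector = await session.get(SearchSourceConnector, connector_id)
    if connector:
        connector.last_indexed_at = datetime.now(timezone.utc)  # column name is assumed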


@@ -101,6 +101,19 @@ class SearchSourceConnectorBase(BaseModel):
            # Ensure the bot token is not empty
            if not config.get("DISCORD_BOT_TOKEN"):
                raise ValueError("DISCORD_BOT_TOKEN cannot be empty")
        elif connector_type == SearchSourceConnectorType.JIRA_CONNECTOR:
            # For JIRA_CONNECTOR, allow JIRA_PERSONAL_ACCESS_TOKEN and JIRA_BASE_URL
            allowed_keys = ["JIRA_PERSONAL_ACCESS_TOKEN", "JIRA_BASE_URL"]
            if set(config.keys()) != set(allowed_keys):
                raise ValueError(f"For JIRA_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")

            # Ensure the token is not empty
            if not config.get("JIRA_PERSONAL_ACCESS_TOKEN"):
                raise ValueError("JIRA_PERSONAL_ACCESS_TOKEN cannot be empty")

            # Ensure the base URL is not empty
            if not config.get("JIRA_BASE_URL"):
                raise ValueError("JIRA_BASE_URL cannot be empty")

        return config
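In practice this validator accepts exactly the two Jira keys and nothing else. A quick illustration of what passes and what is rejected (the URLs, tokens, and extra key are made up):

valid_config = {
    "JIRA_BASE_URL": "https://example.atlassian.net",
    "JIRA_PERSONAL_ACCESS_TOKEN": "abc123",
}  # passes: exactly the allowed keys, both non-empty

missing_url = {"JIRA_PERSONAL_ACCESS_TOKEN": "abc123"}
# rejected: the key set differs from the allowed keys

extra_key = {**valid_config, "JIRA_PROJECT": "PROJ"}
# rejected for the same reason, even though both required keys are present

empty_token = {"JIRA_BASE_URL": "https://example.atlassian.net", "JIRA_PERSONAL_ACCESS_TOKEN": ""}
# rejected: JIRA_PERSONAL_ACCESS_TOKEN cannot be empty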


@@ -1,15 +1,21 @@
import asyncio
from typing import Dict, List, Optional

from app.agents.researcher.configuration import SearchMode
from app.db import (
    Chunk,
    Document,
    SearchSourceConnector,
    SearchSourceConnectorType,
    SearchSpace,
)
from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever
from linkup import LinkupClient
from sqlalchemy import func
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from tavily import TavilyClient


class ConnectorService:
@@ -18,8 +24,12 @@ class ConnectorService:
        self.chunk_retriever = ChucksHybridSearchRetriever(session)
        self.document_retriever = DocumentHybridSearchRetriever(session)
        self.user_id = user_id
        self.source_id_counter = (
            100000  # High starting value to avoid collisions with existing IDs
        )
        self.counter_lock = (
            asyncio.Lock()
        )  # Lock to protect counter in multithreaded environments

    async def initialize_counter(self):
        """
@@ -38,13 +48,22 @@ class ConnectorService:
            )
            chunk_count = result.scalar() or 0
            self.source_id_counter = chunk_count + 1
            print(
                f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}"
            )
        except Exception as e:
            print(f"Error initializing source_id_counter: {str(e)}")
            # Fallback to default value
            self.source_id_counter = 1
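The reformatted __init__ keeps the same two safeguards for citation IDs: a high starting counter so generated IDs do not collide with real document IDs, and an asyncio.Lock so concurrent searches do not hand out the same ID twice. A stripped-down sketch of that pattern (names shortened, not the service itself):

import asyncio

class IdAllocator:
    def __init__(self, start: int = 100000):
        self.counter = start        # high floor to avoid clashing with DB ids
        self.lock = asyncio.Lock()  # serialise increments across concurrent tasks

    async def next_id(self) -> int:
        async with self.lock:
            value = self.counter
            self.counter += 1
            return value

async def demo():
    alloc = IdAllocator()
    ids = await asyncio.gather(*(alloc.next_id() for _ in range(5)))
    print(ids)  # five distinct ids starting at 100000

# asyncio.run(demo())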
async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_crawled_urls(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for crawled URLs and return both the source information and langchain documents Search for crawled URLs and return both the source information and langchain documents
@ -57,7 +76,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="CRAWLED_URL" document_type="CRAWLED_URL",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
crawled_urls_chunks = await self.document_retriever.hybrid_search( crawled_urls_chunks = await self.document_retriever.hybrid_search(
@ -65,7 +84,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="CRAWLED_URL" document_type="CRAWLED_URL",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks)
@ -84,15 +103,18 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(crawled_urls_chunks): for _i, chunk in enumerate(crawled_urls_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a source entry # Create a source entry
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": document.get('title', 'Untitled Document'), "title": document.get("title", "Untitled Document"),
"description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), "description": metadata.get(
"url": metadata.get('url', '') "og:description",
metadata.get("ogDescription", chunk.get("content", "")[:100]),
),
"url": metadata.get("url", ""),
} }
self.source_id_counter += 1 self.source_id_counter += 1
@ -108,7 +130,14 @@ class ConnectorService:
return result_object, crawled_urls_chunks return result_object, crawled_urls_chunks
async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_files(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for files and return both the source information and langchain documents Search for files and return both the source information and langchain documents
@ -121,7 +150,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="FILE" document_type="FILE",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
files_chunks = await self.document_retriever.hybrid_search( files_chunks = await self.document_retriever.hybrid_search(
@ -129,7 +158,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="FILE" document_type="FILE",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
files_chunks = self._transform_document_results(files_chunks) files_chunks = self._transform_document_results(files_chunks)
@ -148,15 +177,18 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(files_chunks): for _i, chunk in enumerate(files_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a source entry # Create a source entry
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": document.get('title', 'Untitled Document'), "title": document.get("title", "Untitled Document"),
"description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), "description": metadata.get(
"url": metadata.get('url', '') "og:description",
metadata.get("ogDescription", chunk.get("content", "")[:100]),
),
"url": metadata.get("url", ""),
} }
self.source_id_counter += 1 self.source_id_counter += 1
@ -185,19 +217,23 @@ class ConnectorService:
""" """
transformed_results = [] transformed_results = []
for doc in document_results: for doc in document_results:
transformed_results.append({ transformed_results.append(
'document': { {
'id': doc.get('document_id'), "document": {
'title': doc.get('title', 'Untitled Document'), "id": doc.get("document_id"),
'document_type': doc.get('document_type'), "title": doc.get("title", "Untitled Document"),
'metadata': doc.get('metadata', {}), "document_type": doc.get("document_type"),
"metadata": doc.get("metadata", {}),
}, },
'content': doc.get('chunks_content', doc.get('content', '')), "content": doc.get("chunks_content", doc.get("content", "")),
'score': doc.get('score', 0.0) "score": doc.get("score", 0.0),
}) }
)
return transformed_results return transformed_results
async def get_connector_by_type(self, user_id: str, connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]: async def get_connector_by_type(
self, user_id: str, connector_type: SearchSourceConnectorType
) -> Optional[SearchSourceConnector]:
""" """
Get a connector by type for a specific user Get a connector by type for a specific user
@ -209,15 +245,16 @@ class ConnectorService:
Optional[SearchSourceConnector]: The connector if found, None otherwise Optional[SearchSourceConnector]: The connector if found, None otherwise
""" """
result = await self.session.execute( result = await self.session.execute(
select(SearchSourceConnector) select(SearchSourceConnector).filter(
.filter(
SearchSourceConnector.user_id == user_id, SearchSourceConnector.user_id == user_id,
SearchSourceConnector.connector_type == connector_type SearchSourceConnector.connector_type == connector_type,
) )
) )
return result.scalars().first() return result.scalars().first()
async def search_tavily(self, user_query: str, user_id: str, top_k: int = 20) -> tuple: async def search_tavily(
self, user_query: str, user_id: str, top_k: int = 20
) -> tuple:
""" """
Search using Tavily API and return both the source information and documents Search using Tavily API and return both the source information and documents
@ -230,7 +267,9 @@ class ConnectorService:
tuple: (sources_info, documents) tuple: (sources_info, documents)
""" """
# Get Tavily connector configuration # Get Tavily connector configuration
tavily_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.TAVILY_API) tavily_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.TAVILY_API
)
if not tavily_connector: if not tavily_connector:
# Return empty results if no Tavily connector is configured # Return empty results if no Tavily connector is configured
@ -250,7 +289,7 @@ class ConnectorService:
response = tavily_client.search( response = tavily_client.search(
query=user_query, query=user_query,
max_results=top_k, max_results=top_k,
search_depth="advanced" # Use advanced search for better results search_depth="advanced", # Use advanced search for better results
) )
# Extract results from Tavily response # Extract results from Tavily response
@ -271,13 +310,12 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for i, result in enumerate(tavily_results): for i, result in enumerate(tavily_results):
# Create a source entry # Create a source entry
source = { source = {
"id": self.source_id_counter, "id": self.source_id_counter,
"title": result.get("title", "Tavily Result"), "title": result.get("title", "Tavily Result"),
"description": result.get("content", "")[:100], "description": result.get("content", "")[:100],
"url": result.get("url", "") "url": result.get("url", ""),
} }
sources_list.append(source) sources_list.append(source)
@ -293,9 +331,9 @@ class ConnectorService:
"metadata": { "metadata": {
"url": result.get("url", ""), "url": result.get("url", ""),
"published_date": result.get("published_date", ""), "published_date": result.get("published_date", ""),
"source": "TAVILY_API" "source": "TAVILY_API",
} },
} },
} }
documents.append(document) documents.append(document)
self.source_id_counter += 1 self.source_id_counter += 1
@ -320,7 +358,14 @@ class ConnectorService:
"sources": [], "sources": [],
}, [] }, []
async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_slack(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for slack and return both the source information and langchain documents Search for slack and return both the source information and langchain documents
@ -333,7 +378,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="SLACK_CONNECTOR" document_type="SLACK_CONNECTOR",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
slack_chunks = await self.document_retriever.hybrid_search( slack_chunks = await self.document_retriever.hybrid_search(
@ -341,7 +386,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="SLACK_CONNECTOR" document_type="SLACK_CONNECTOR",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
slack_chunks = self._transform_document_results(slack_chunks) slack_chunks = self._transform_document_results(slack_chunks)
@ -360,13 +405,13 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(slack_chunks): for _i, chunk in enumerate(slack_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a mapped source entry with Slack-specific metadata # Create a mapped source entry with Slack-specific metadata
channel_name = metadata.get('channel_name', 'Unknown Channel') channel_name = metadata.get("channel_name", "Unknown Channel")
channel_id = metadata.get('channel_id', '') channel_id = metadata.get("channel_id", "")
message_date = metadata.get('start_date', '') message_date = metadata.get("start_date", "")
# Create a more descriptive title for Slack messages # Create a more descriptive title for Slack messages
title = f"Slack: {channel_name}" title = f"Slack: {channel_name}"
@ -374,7 +419,7 @@ class ConnectorService:
title += f" ({message_date})" title += f" ({message_date})"
# Create a more descriptive description for Slack messages # Create a more descriptive description for Slack messages
description = chunk.get('content', '')[:100] description = chunk.get("content", "")[:100]
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
@ -384,7 +429,7 @@ class ConnectorService:
url = f"https://slack.com/app_redirect?channel={channel_id}" url = f"https://slack.com/app_redirect?channel={channel_id}"
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": url, "url": url,
@ -403,7 +448,14 @@ class ConnectorService:
return result_object, slack_chunks return result_object, slack_chunks
async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_notion(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for Notion pages and return both the source information and langchain documents Search for Notion pages and return both the source information and langchain documents
@ -422,7 +474,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="NOTION_CONNECTOR" document_type="NOTION_CONNECTOR",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
notion_chunks = await self.document_retriever.hybrid_search( notion_chunks = await self.document_retriever.hybrid_search(
@ -430,7 +482,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="NOTION_CONNECTOR" document_type="NOTION_CONNECTOR",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
notion_chunks = self._transform_document_results(notion_chunks) notion_chunks = self._transform_document_results(notion_chunks)
@ -449,13 +501,13 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(notion_chunks): for _i, chunk in enumerate(notion_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a mapped source entry with Notion-specific metadata # Create a mapped source entry with Notion-specific metadata
page_title = metadata.get('page_title', 'Untitled Page') page_title = metadata.get("page_title", "Untitled Page")
page_id = metadata.get('page_id', '') page_id = metadata.get("page_id", "")
indexed_at = metadata.get('indexed_at', '') indexed_at = metadata.get("indexed_at", "")
# Create a more descriptive title for Notion pages # Create a more descriptive title for Notion pages
title = f"Notion: {page_title}" title = f"Notion: {page_title}"
@ -463,7 +515,7 @@ class ConnectorService:
title += f" (indexed: {indexed_at})" title += f" (indexed: {indexed_at})"
# Create a more descriptive description for Notion pages # Create a more descriptive description for Notion pages
description = chunk.get('content', '')[:100] description = chunk.get("content", "")[:100]
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
@ -474,7 +526,7 @@ class ConnectorService:
url = f"https://notion.so/{page_id.replace('-', '')}" url = f"https://notion.so/{page_id.replace('-', '')}"
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": url, "url": url,
@ -493,7 +545,14 @@ class ConnectorService:
return result_object, notion_chunks return result_object, notion_chunks
async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_extension(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for extension data and return both the source information and langchain documents Search for extension data and return both the source information and langchain documents
@ -512,7 +571,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="EXTENSION" document_type="EXTENSION",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
extension_chunks = await self.document_retriever.hybrid_search( extension_chunks = await self.document_retriever.hybrid_search(
@ -520,7 +579,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="EXTENSION" document_type="EXTENSION",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
extension_chunks = self._transform_document_results(extension_chunks) extension_chunks = self._transform_document_results(extension_chunks)
@ -539,15 +598,17 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for i, chunk in enumerate(extension_chunks): for i, chunk in enumerate(extension_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Extract extension-specific metadata # Extract extension-specific metadata
webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') webpage_title = metadata.get("VisitedWebPageTitle", "Untitled Page")
webpage_url = metadata.get('VisitedWebPageURL', '') webpage_url = metadata.get("VisitedWebPageURL", "")
visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') visit_date = metadata.get("VisitedWebPageDateWithTimeInISOString", "")
visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') visit_duration = metadata.get(
browsing_session_id = metadata.get('BrowsingSessionId', '') "VisitedWebPageVisitDurationInMilliseconds", ""
)
browsing_session_id = metadata.get("BrowsingSessionId", "")
# Create a more descriptive title for extension data # Create a more descriptive title for extension data
title = webpage_title title = webpage_title
@ -555,14 +616,18 @@ class ConnectorService:
# Format the date for display (simplified) # Format the date for display (simplified)
try: try:
# Just extract the date part for display # Just extract the date part for display
formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date formatted_date = (
visit_date.split("T")[0]
if "T" in visit_date
else visit_date
)
title += f" (visited: {formatted_date})" title += f" (visited: {formatted_date})"
except: except:
# Fallback if date parsing fails # Fallback if date parsing fails
title += f" (visited: {visit_date})" title += f" (visited: {visit_date})"
# Create a more descriptive description for extension data # Create a more descriptive description for extension data
description = chunk.get('content', '')[:100] description = chunk.get("content", "")[:100]
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
@ -573,7 +638,7 @@ class ConnectorService:
if duration_seconds < 60: if duration_seconds < 60:
duration_text = f"{duration_seconds:.1f} seconds" duration_text = f"{duration_seconds:.1f} seconds"
else: else:
duration_text = f"{duration_seconds/60:.1f} minutes" duration_text = f"{duration_seconds / 60:.1f} minutes"
if description: if description:
description += f" | Duration: {duration_text}" description += f" | Duration: {duration_text}"
@ -582,10 +647,10 @@ class ConnectorService:
pass pass
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": webpage_url "url": webpage_url,
} }
self.source_id_counter += 1 self.source_id_counter += 1
@ -601,7 +666,14 @@ class ConnectorService:
return result_object, extension_chunks return result_object, extension_chunks
async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_youtube(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for YouTube videos and return both the source information and langchain documents Search for YouTube videos and return both the source information and langchain documents
@ -620,7 +692,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO" document_type="YOUTUBE_VIDEO",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
youtube_chunks = await self.document_retriever.hybrid_search( youtube_chunks = await self.document_retriever.hybrid_search(
@ -628,7 +700,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO" document_type="YOUTUBE_VIDEO",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
youtube_chunks = self._transform_document_results(youtube_chunks) youtube_chunks = self._transform_document_results(youtube_chunks)
@ -647,13 +719,13 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(youtube_chunks): for _i, chunk in enumerate(youtube_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Extract YouTube-specific metadata # Extract YouTube-specific metadata
video_title = metadata.get('video_title', 'Untitled Video') video_title = metadata.get("video_title", "Untitled Video")
video_id = metadata.get('video_id', '') video_id = metadata.get("video_id", "")
channel_name = metadata.get('channel_name', '') channel_name = metadata.get("channel_name", "")
# published_date = metadata.get('published_date', '') # published_date = metadata.get('published_date', '')
# Create a more descriptive title for YouTube videos # Create a more descriptive title for YouTube videos
@ -662,7 +734,9 @@ class ConnectorService:
title += f" - {channel_name}" title += f" - {channel_name}"
# Create a more descriptive description for YouTube videos # Create a more descriptive description for YouTube videos
description = metadata.get('description', chunk.get('content', '')[:100]) description = metadata.get(
"description", chunk.get("content", "")[:100]
)
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
@ -670,12 +744,12 @@ class ConnectorService:
url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" url = f"https://www.youtube.com/watch?v={video_id}" if video_id else ""
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": url, "url": url,
"video_id": video_id, # Additional field for YouTube videos "video_id": video_id, # Additional field for YouTube videos
"channel_name": channel_name # Additional field for YouTube videos "channel_name": channel_name, # Additional field for YouTube videos
} }
self.source_id_counter += 1 self.source_id_counter += 1
@ -691,7 +765,14 @@ class ConnectorService:
return result_object, youtube_chunks return result_object, youtube_chunks
async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_github(
self,
user_query: str,
user_id: int,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for GitHub documents and return both the source information and langchain documents Search for GitHub documents and return both the source information and langchain documents
@ -704,7 +785,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR" document_type="GITHUB_CONNECTOR",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
github_chunks = await self.document_retriever.hybrid_search( github_chunks = await self.document_retriever.hybrid_search(
@ -712,7 +793,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR" document_type="GITHUB_CONNECTOR",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
github_chunks = self._transform_document_results(github_chunks) github_chunks = self._transform_document_results(github_chunks)
@ -731,15 +812,19 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(github_chunks): for _i, chunk in enumerate(github_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a source entry # Create a source entry
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": document.get('title', 'GitHub Document'), # Use specific title if available "title": document.get(
"description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview "title", "GitHub Document"
"url": metadata.get('url', '') # Use URL if available in metadata ), # Use specific title if available
"description": metadata.get(
"description", chunk.get("content", "")[:100]
), # Use description or content preview
"url": metadata.get("url", ""), # Use URL if available in metadata
} }
self.source_id_counter += 1 self.source_id_counter += 1
@ -755,7 +840,14 @@ class ConnectorService:
return result_object, github_chunks return result_object, github_chunks
async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_linear(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for Linear issues and comments and return both the source information and langchain documents Search for Linear issues and comments and return both the source information and langchain documents
@ -774,7 +866,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR" document_type="LINEAR_CONNECTOR",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
linear_chunks = await self.document_retriever.hybrid_search( linear_chunks = await self.document_retriever.hybrid_search(
@ -782,7 +874,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR" document_type="LINEAR_CONNECTOR",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
linear_chunks = self._transform_document_results(linear_chunks) linear_chunks = self._transform_document_results(linear_chunks)
@ -801,14 +893,14 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for _i, chunk in enumerate(linear_chunks): for _i, chunk in enumerate(linear_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Extract Linear-specific metadata # Extract Linear-specific metadata
issue_identifier = metadata.get('issue_identifier', '') issue_identifier = metadata.get("issue_identifier", "")
issue_title = metadata.get('issue_title', 'Untitled Issue') issue_title = metadata.get("issue_title", "Untitled Issue")
issue_state = metadata.get('state', '') issue_state = metadata.get("state", "")
comment_count = metadata.get('comment_count', 0) comment_count = metadata.get("comment_count", 0)
# Create a more descriptive title for Linear issues # Create a more descriptive title for Linear issues
title = f"Linear: {issue_identifier} - {issue_title}" title = f"Linear: {issue_identifier} - {issue_title}"
@ -816,7 +908,7 @@ class ConnectorService:
title += f" ({issue_state})" title += f" ({issue_state})"
# Create a more descriptive description for Linear issues # Create a more descriptive description for Linear issues
description = chunk.get('content', '')[:100] description = chunk.get("content", "")[:100]
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
@ -835,13 +927,13 @@ class ConnectorService:
url = f"https://linear.app/issue/{issue_identifier}" url = f"https://linear.app/issue/{issue_identifier}"
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": url, "url": url,
"issue_identifier": issue_identifier, "issue_identifier": issue_identifier,
"state": issue_state, "state": issue_state,
"comment_count": comment_count "comment_count": comment_count,
} }
self.source_id_counter += 1 self.source_id_counter += 1
@@ -857,7 +949,14 @@ class ConnectorService:

        return result_object, linear_chunks

    async def search_jira(
        self,
        user_query: str,
        user_id: str,
        search_space_id: int,
        top_k: int = 20,
        search_mode: SearchMode = SearchMode.CHUNKS,
    ) -> tuple:
        """
        Search for Jira issues and comments and return both the source information and langchain documents
@@ -877,7 +976,7 @@ class ConnectorService:
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="JIRA_CONNECTOR",
            )
        elif search_mode == SearchMode.DOCUMENTS:
            jira_chunks = await self.document_retriever.hybrid_search(
@@ -885,7 +984,7 @@ class ConnectorService:
                top_k=top_k,
                user_id=user_id,
                search_space_id=search_space_id,
                document_type="JIRA_CONNECTOR",
            )
            # Transform document retriever results to match expected format
            jira_chunks = self._transform_document_results(jira_chunks)
@@ -904,16 +1003,16 @@ class ConnectorService:
        async with self.counter_lock:
            for _i, chunk in enumerate(jira_chunks):
                # Extract document metadata
                document = chunk.get("document", {})
                metadata = document.get("metadata", {})

                # Extract Jira-specific metadata
                issue_key = metadata.get("issue_key", "")
                issue_title = metadata.get("issue_title", "Untitled Issue")
                status = metadata.get("status", "")
                priority = metadata.get("priority", "")
                issue_type = metadata.get("issue_type", "")
                comment_count = metadata.get("comment_count", 0)

                # Create a more descriptive title for Jira issues
                title = f"Jira: {issue_key} - {issue_title}"
@@ -921,7 +1020,7 @@ class ConnectorService:
                    title += f" ({status})"

                # Create a more descriptive description for Jira issues
                description = chunk.get("content", "")[:100]
                if len(description) == 100:
                    description += "..."
@@ -938,16 +1037,16 @@ class ConnectorService:
                if description:
                    description += f" | {' | '.join(info_parts)}"
                else:
                    description = " | ".join(info_parts)

                # For URL, we could construct a URL to the Jira issue if we have the base URL
                # For now, use a generic placeholder
                url = ""
                if issue_key and metadata.get("base_url"):
                    url = f"{metadata.get('base_url')}/browse/{issue_key}"

                source = {
                    "id": document.get("id", self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
@@ -955,7 +1054,7 @@ class ConnectorService:
                    "status": status,
                    "priority": priority,
                    "issue_type": issue_type,
                    "comment_count": comment_count,
                }
                self.source_id_counter += 1
@@ -971,7 +1070,9 @@ class ConnectorService:
        return result_object, jira_chunks
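Callers consume search_jira as a (source_object, chunks) pair, as in the fetch_relevant_documents hunk at the top of this commit. A rough usage sketch, with an example query and a hypothetical wrapper function; SearchMode comes from app.agents.researcher.configuration as imported above:

from app.agents.researcher.configuration import SearchMode

async def cite_jira(connector_service, user_id: str, search_space_id: int):
    # connector_service is an initialized ConnectorService; names mirror fetch_relevant_documents
    source_object, jira_chunks = await connector_service.search_jira(
        user_query="login timeout after deploy",  # example query
        user_id=user_id,
        search_space_id=search_space_id,
        top_k=20,
        search_mode=SearchMode.CHUNKS,
    )
    # source_object bundles the per-issue source entries built above (id/title/description/url plus
    # status, priority, issue_type, comment_count); jira_chunks are the raw retriever hits that get
    # merged into all_raw_documents for answer generation.
    return source_object, jira_chunks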
async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: async def search_linkup(
self, user_query: str, user_id: str, mode: str = "standard"
) -> tuple:
""" """
Search using Linkup API and return both the source information and documents Search using Linkup API and return both the source information and documents
@ -984,7 +1085,9 @@ class ConnectorService:
tuple: (sources_info, documents) tuple: (sources_info, documents)
""" """
# Get Linkup connector configuration # Get Linkup connector configuration
linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) linkup_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.LINKUP_API
)
if not linkup_connector: if not linkup_connector:
# Return empty results if no Linkup connector is configured # Return empty results if no Linkup connector is configured
@ -1008,7 +1111,7 @@ class ConnectorService:
) )
# Extract results from Linkup response - access as attribute instead of using .get() # Extract results from Linkup response - access as attribute instead of using .get()
linkup_results = response.results if hasattr(response, 'results') else [] linkup_results = response.results if hasattr(response, "results") else []
# Only proceed if we have results # Only proceed if we have results
if not linkup_results: if not linkup_results:
@ -1026,33 +1129,41 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for i, result in enumerate(linkup_results): for i, result in enumerate(linkup_results):
# Only process results that have content # Only process results that have content
if not hasattr(result, 'content') or not result.content: if not hasattr(result, "content") or not result.content:
continue continue
# Create a source entry # Create a source entry
source = { source = {
"id": self.source_id_counter, "id": self.source_id_counter,
"title": result.name if hasattr(result, 'name') else "Linkup Result", "title": (
"description": result.content[:100] if hasattr(result, 'content') else "", result.name if hasattr(result, "name") else "Linkup Result"
"url": result.url if hasattr(result, 'url') else "" ),
"description": (
result.content[:100] if hasattr(result, "content") else ""
),
"url": result.url if hasattr(result, "url") else "",
} }
sources_list.append(source) sources_list.append(source)
# Create a document entry # Create a document entry
document = { document = {
"chunk_id": f"linkup_chunk_{i}", "chunk_id": f"linkup_chunk_{i}",
"content": result.content if hasattr(result, 'content') else "", "content": result.content if hasattr(result, "content") else "",
"score": 1.0, # Default score since not provided by Linkup "score": 1.0, # Default score since not provided by Linkup
"document": { "document": {
"id": self.source_id_counter, "id": self.source_id_counter,
"title": result.name if hasattr(result, 'name') else "Linkup Result", "title": (
result.name
if hasattr(result, "name")
else "Linkup Result"
),
"document_type": "LINKUP_API", "document_type": "LINKUP_API",
"metadata": { "metadata": {
"url": result.url if hasattr(result, 'url') else "", "url": result.url if hasattr(result, "url") else "",
"type": result.type if hasattr(result, 'type') else "", "type": result.type if hasattr(result, "type") else "",
"source": "LINKUP_API" "source": "LINKUP_API",
} },
} },
} }
documents.append(document) documents.append(document)
self.source_id_counter += 1 self.source_id_counter += 1
@ -1077,7 +1188,14 @@ class ConnectorService:
"sources": [], "sources": [],
}, [] }, []
async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: async def search_discord(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
""" """
Search for Discord messages and return both the source information and langchain documents Search for Discord messages and return both the source information and langchain documents
@ -1096,7 +1214,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR" document_type="DISCORD_CONNECTOR",
) )
elif search_mode == SearchMode.DOCUMENTS: elif search_mode == SearchMode.DOCUMENTS:
discord_chunks = await self.document_retriever.hybrid_search( discord_chunks = await self.document_retriever.hybrid_search(
@ -1104,7 +1222,7 @@ class ConnectorService:
top_k=top_k, top_k=top_k,
user_id=user_id, user_id=user_id,
search_space_id=search_space_id, search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR" document_type="DISCORD_CONNECTOR",
) )
# Transform document retriever results to match expected format # Transform document retriever results to match expected format
discord_chunks = self._transform_document_results(discord_chunks) discord_chunks = self._transform_document_results(discord_chunks)
@ -1123,13 +1241,13 @@ class ConnectorService:
async with self.counter_lock: async with self.counter_lock:
for i, chunk in enumerate(discord_chunks): for i, chunk in enumerate(discord_chunks):
# Extract document metadata # Extract document metadata
document = chunk.get('document', {}) document = chunk.get("document", {})
metadata = document.get('metadata', {}) metadata = document.get("metadata", {})
# Create a mapped source entry with Discord-specific metadata # Create a mapped source entry with Discord-specific metadata
channel_name = metadata.get('channel_name', 'Unknown Channel') channel_name = metadata.get("channel_name", "Unknown Channel")
channel_id = metadata.get('channel_id', '') channel_id = metadata.get("channel_id", "")
message_date = metadata.get('start_date', '') message_date = metadata.get("start_date", "")
# Create a more descriptive title for Discord messages # Create a more descriptive title for Discord messages
title = f"Discord: {channel_name}" title = f"Discord: {channel_name}"
@ -1137,12 +1255,12 @@ class ConnectorService:
title += f" ({message_date})" title += f" ({message_date})"
# Create a more descriptive description for Discord messages # Create a more descriptive description for Discord messages
description = chunk.get('content', '')[:100] description = chunk.get("content", "")[:100]
if len(description) == 100: if len(description) == 100:
description += "..." description += "..."
url = "" url = ""
guild_id = metadata.get('guild_id', '') guild_id = metadata.get("guild_id", "")
if guild_id and channel_id: if guild_id and channel_id:
url = f"https://discord.com/channels/{guild_id}/{channel_id}" url = f"https://discord.com/channels/{guild_id}/{channel_id}"
elif channel_id: elif channel_id:
@ -1150,7 +1268,7 @@ class ConnectorService:
url = f"https://discord.com/channels/@me/{channel_id}" url = f"https://discord.com/channels/@me/{channel_id}"
source = { source = {
"id": document.get('id', self.source_id_counter), "id": document.get("id", self.source_id_counter),
"title": title, "title": title,
"description": description, "description": description,
"url": url, "url": url,
@@ -1168,5 +1286,3 @@ class ConnectorService:
        }

        return result_object, discord_chunks

File diff suppressed because it is too large.