SurfSense/surfsense_backend/app/tasks/connector_indexers/base.py
"""
Base functionality and shared imports for connector indexers.
"""
import logging
from datetime import datetime, timedelta
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config
from app.db import (
Chunk,
Document,
SearchSourceConnector,
SearchSourceConnectorType,
)
# Set up logging
logger = logging.getLogger(__name__)


async def check_duplicate_document_by_hash(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """
    Check if a document with the given content hash already exists.

    Args:
        session: Database session
        content_hash: Hash of the document content

    Returns:
        Existing document if found, None otherwise
    """
    existing_doc_result = await session.execute(
        select(Document).where(Document.content_hash == content_hash)
    )
    return existing_doc_result.scalars().first()
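
# Usage sketch for deduplication in an indexer (illustrative only; the hashing
# scheme and early-return flow are assumptions about the caller, not part of
# this module):
#
#     content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
#     if await check_duplicate_document_by_hash(session, content_hash):
#         logger.info("Document already indexed, skipping")
#         return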


async def create_document_chunks(content: str) -> list[Chunk]:
    """
    Create chunks from document content.

    Args:
        content: Document content to chunk

    Returns:
        List of Chunk objects with embeddings
    """
    return [
        Chunk(
            content=chunk.text,
            embedding=config.embedding_model_instance.embed(chunk.text),
        )
        for chunk in config.chunker_instance.chunk(content)
    ]
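
# Usage sketch for attaching chunks to a new document (illustrative; the
# Document constructor arguments and chunks relationship shown here are
# assumptions about the ORM model):
#
#     chunks = await create_document_chunks(document_text)
#     document = Document(content=document_text, content_hash=content_hash)
#     document.chunks = chunks
#     session.add(document)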


async def get_connector_by_id(
    session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
) -> SearchSourceConnector | None:
    """
    Get a connector by ID and type from the database.

    Args:
        session: Database session
        connector_id: ID of the connector
        connector_type: Expected type of the connector

    Returns:
        Connector object if found, None otherwise
    """
    result = await session.execute(
        select(SearchSourceConnector).filter(
            SearchSourceConnector.id == connector_id,
            SearchSourceConnector.connector_type == connector_type,
        )
    )
    return result.scalars().first()
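
# Usage sketch (illustrative; the enum member and the caller's error handling
# are assumptions):
#
#     connector = await get_connector_by_id(
#         session, connector_id, SearchSourceConnectorType.SLACK_CONNECTOR
#     )
#     if connector is None:
#         raise ValueError(f"Connector {connector_id} not found or wrong type")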


def calculate_date_range(
    connector: SearchSourceConnector,
    start_date: str | None = None,
    end_date: str | None = None,
    default_days_back: int = 365,
) -> tuple[str, str]:
    """
    Calculate the date range for indexing based on the provided dates or the
    connector's last indexed date.

    Args:
        connector: The connector object
        start_date: Optional start date string (YYYY-MM-DD)
        end_date: Optional end date string (YYYY-MM-DD)
        default_days_back: Default number of days to go back if there is no
            last indexed date

    Returns:
        Tuple of (start_date_str, end_date_str)
    """
    if start_date is not None and end_date is not None:
        return start_date, end_date

    # Fall back to calculating dates based on last_indexed_at
    calculated_end_date = datetime.now()

    # Use last_indexed_at as the start date if available, otherwise go back
    # default_days_back days
    if connector.last_indexed_at:
        # Convert dates to be comparable (both timezone-naive)
        last_indexed_naive = (
            connector.last_indexed_at.replace(tzinfo=None)
            if connector.last_indexed_at.tzinfo
            else connector.last_indexed_at
        )

        # Guard against a last_indexed_at that lies in the future
        if last_indexed_naive > calculated_end_date:
            logger.warning(
                f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) "
                f"is in the future. Using {default_days_back} days ago instead."
            )
            calculated_start_date = calculated_end_date - timedelta(
                days=default_days_back
            )
        else:
            calculated_start_date = last_indexed_naive
            logger.info(
                f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
            )
    else:
        calculated_start_date = calculated_end_date - timedelta(days=default_days_back)
        logger.info(
            "No last_indexed_at found, using "
            f"{calculated_start_date.strftime('%Y-%m-%d')} "
            f"({default_days_back} days ago) as start date"
        )

    # Use the calculated dates for whichever bound was not provided
    start_date_str = (
        start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
    )
    end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")

    return start_date_str, end_date_str
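
# Usage sketch of the three fallback cases (dates are illustrative):
#
#     # Explicit range wins, returned unchanged:
#     calculate_date_range(connector, "2024-01-01", "2024-06-30")
#     # -> ("2024-01-01", "2024-06-30")
#
#     # No dates given: start falls back to last_indexed_at (or
#     # default_days_back days ago), end falls back to today.
#     start_str, end_str = calculate_date_range(connector)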


async def update_connector_last_indexed(
    session: AsyncSession,
    connector: SearchSourceConnector,
    update_last_indexed: bool = True,
) -> None:
    """
    Update the last_indexed_at timestamp for a connector.

    Note: the timestamp is only set on the in-memory object; committing the
    session is left to the caller.

    Args:
        session: Database session
        connector: The connector object
        update_last_indexed: Whether to actually update the timestamp
    """
    if update_last_indexed:
        connector.last_indexed_at = datetime.now()
        logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")


def build_document_metadata_string(
    metadata_sections: list[tuple[str, list[str]]],
) -> str:
    """
    Build a document string from metadata sections.

    Args:
        metadata_sections: List of (section_title, section_content) tuples

    Returns:
        Combined document string
    """
    document_parts = ["<DOCUMENT>"]

    for section_title, section_content in metadata_sections:
        document_parts.append(f"<{section_title}>")
        document_parts.extend(section_content)
        document_parts.append(f"</{section_title}>")

    document_parts.append("</DOCUMENT>")
    return "\n".join(document_parts)