Mirror of https://github.com/MODSetter/SurfSense.git (synced 2025-09-09 13:54:40 +00:00)
refactor: refactored background_tasks & indexing_tasks
This commit is contained in: parent 356bbb86f5, commit 5aa52375c3
24 changed files with 4704 additions and 5149 deletions
surfsense_backend/app/tasks/connector_indexers/base.py (new file, 183 lines)
@@ -0,0 +1,183 @@
"""
|
||||
Base functionality and shared imports for connector indexers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
|
||||
from app.config import config
|
||||
from app.db import (
|
||||
Chunk,
|
||||
Document,
|
||||
SearchSourceConnector,
|
||||
SearchSourceConnectorType,
|
||||
)
|
||||
|
||||
# Set up logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def check_duplicate_document_by_hash(
|
||||
session: AsyncSession, content_hash: str
|
||||
) -> Document | None:
|
||||
"""
|
||||
Check if a document with the given content hash already exists.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
content_hash: Hash of the document content
|
||||
|
||||
Returns:
|
||||
Existing document if found, None otherwise
|
||||
"""
|
||||
existing_doc_result = await session.execute(
|
||||
select(Document).where(Document.content_hash == content_hash)
|
||||
)
|
||||
return existing_doc_result.scalars().first()
|
||||
|
||||
|
||||
async def create_document_chunks(content: str) -> list[Chunk]:
|
||||
"""
|
||||
Create chunks from document content.
|
||||
|
||||
Args:
|
||||
content: Document content to chunk
|
||||
|
||||
Returns:
|
||||
List of Chunk objects with embeddings
|
||||
"""
|
||||
return [
|
||||
Chunk(
|
||||
content=chunk.text,
|
||||
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||
)
|
||||
for chunk in config.chunker_instance.chunk(content)
|
||||
]
|
||||
|
||||
|
||||
async def get_connector_by_id(
|
||||
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
|
||||
) -> SearchSourceConnector | None:
|
||||
"""
|
||||
Get a connector by ID and type from the database.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the connector
|
||||
connector_type: Expected type of the connector
|
||||
|
||||
Returns:
|
||||
Connector object if found, None otherwise
|
||||
"""
|
||||
result = await session.execute(
|
||||
select(SearchSourceConnector).filter(
|
||||
SearchSourceConnector.id == connector_id,
|
||||
SearchSourceConnector.connector_type == connector_type,
|
||||
)
|
||||
)
|
||||
return result.scalars().first()
|
||||
|
||||
|
||||
def calculate_date_range(
|
||||
connector: SearchSourceConnector,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
default_days_back: int = 365,
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Calculate date range for indexing based on provided dates or connector's last indexed date.
|
||||
|
||||
Args:
|
||||
connector: The connector object
|
||||
start_date: Optional start date string (YYYY-MM-DD)
|
||||
end_date: Optional end date string (YYYY-MM-DD)
|
||||
default_days_back: Default number of days to go back if no last indexed date
|
||||
|
||||
Returns:
|
||||
Tuple of (start_date_str, end_date_str)
|
||||
"""
|
||||
if start_date is not None and end_date is not None:
|
||||
return start_date, end_date
|
||||
|
||||
# Fall back to calculating dates based on last_indexed_at
|
||||
calculated_end_date = datetime.now()
|
||||
|
||||
# Use last_indexed_at as start date if available, otherwise use default_days_back
|
||||
if connector.last_indexed_at:
|
||||
# Convert dates to be comparable (both timezone-naive)
|
||||
last_indexed_naive = (
|
||||
connector.last_indexed_at.replace(tzinfo=None)
|
||||
if connector.last_indexed_at.tzinfo
|
||||
else connector.last_indexed_at
|
||||
)
|
||||
|
||||
# Check if last_indexed_at is in the future or after end_date
|
||||
if last_indexed_naive > calculated_end_date:
|
||||
logger.warning(
|
||||
f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using {default_days_back} days ago instead."
|
||||
)
|
||||
calculated_start_date = calculated_end_date - timedelta(
|
||||
days=default_days_back
|
||||
)
|
||||
else:
|
||||
calculated_start_date = last_indexed_naive
|
||||
logger.info(
|
||||
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
|
||||
)
|
||||
else:
|
||||
calculated_start_date = calculated_end_date - timedelta(days=default_days_back)
|
||||
logger.info(
|
||||
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} ({default_days_back} days ago) as start date"
|
||||
)
|
||||
|
||||
# Use calculated dates if not provided
|
||||
start_date_str = (
|
||||
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
|
||||
)
|
||||
end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
|
||||
|
||||
return start_date_str, end_date_str
|
||||
|
||||
|
||||
async def update_connector_last_indexed(
|
||||
session: AsyncSession,
|
||||
connector: SearchSourceConnector,
|
||||
update_last_indexed: bool = True,
|
||||
) -> None:
|
||||
"""
|
||||
Update the last_indexed_at timestamp for a connector.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector: The connector object
|
||||
update_last_indexed: Whether to actually update the timestamp
|
||||
"""
|
||||
if update_last_indexed:
|
||||
connector.last_indexed_at = datetime.now()
|
||||
logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")
|
||||
|
||||
|
||||
def build_document_metadata_string(
|
||||
metadata_sections: list[tuple[str, list[str]]],
|
||||
) -> str:
|
||||
"""
|
||||
Build a document string from metadata sections.
|
||||
|
||||
Args:
|
||||
metadata_sections: List of (section_title, section_content) tuples
|
||||
|
||||
Returns:
|
||||
Combined document string
|
||||
"""
|
||||
document_parts = ["<DOCUMENT>"]
|
||||
|
||||
for section_title, section_content in metadata_sections:
|
||||
document_parts.append(f"<{section_title}>")
|
||||
document_parts.extend(section_content)
|
||||
document_parts.append(f"</{section_title}>")
|
||||
|
||||
document_parts.append("</DOCUMENT>")
|
||||
return "\n".join(document_parts)
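The builder at the end of the file produces a flat, tag-delimited string. For example:

build_document_metadata_string(
    [
        ("METADATA", ["SOURCE: slack", "CHANNEL: #general"]),
        ("CONTENT", ["Hello world"]),
    ]
)

yields:

<DOCUMENT>
<METADATA>
SOURCE: slack
CHANNEL: #general
</METADATA>
<CONTENT>
Hello world
</CONTENT>
</DOCUMENT>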
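For context, a minimal sketch (not part of this commit) of how an indexer in this package might compose the helpers above. The Document field names (content, content_hash, chunks), the SHA-256 hashing, and the fetch_items callable are assumptions for illustration, not SurfSense API:

import hashlib
from collections.abc import Awaitable, Callable

from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Document, SearchSourceConnector, SearchSourceConnectorType


async def index_connector_sketch(
    session: AsyncSession,
    connector_id: int,
    connector_type: SearchSourceConnectorType,
    # Placeholder for the connector-specific API call (assumed signature).
    fetch_items: Callable[[SearchSourceConnector, str, str], Awaitable[list[dict]]],
) -> None:
    # Resolve the connector; a missing or mismatched ID/type pair yields None.
    connector = await get_connector_by_id(session, connector_id, connector_type)
    if connector is None:
        return

    # Window the fetch using last_indexed_at (or the 365-day default).
    start_date, end_date = calculate_date_range(connector)

    for item in await fetch_items(connector, start_date, end_date):
        content = build_document_metadata_string(
            [("METADATA", [f"TITLE: {item['title']}"]), ("CONTENT", [item["body"]])]
        )
        # Hash scheme assumed here; the real indexers may hash differently.
        content_hash = hashlib.sha256(content.encode()).hexdigest()

        # Skip anything already indexed under the same content hash.
        if await check_duplicate_document_by_hash(session, content_hash):
            continue

        chunks = await create_document_chunks(content)
        session.add(
            Document(content=content, content_hash=content_hash, chunks=chunks)
        )

    # Stamp the run and commit once at the end.
    await update_connector_last_indexed(session, connector)
    await session.commit()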