mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-05-02 21:30:38 +00:00
* fix: filter empty content in rebuild embeddings queries Update collect_items_for_rebuild() to properly filter out items with empty or whitespace-only content before submitting embedding jobs. Changes: - Sources: add string::trim(full_text) != '' filter - Notes: add string::trim(content) != '' filter - Insights: add content != none AND string::trim(content) != '' filter (previously had no content filter at all) This prevents unnecessary job submissions that would fail validation in the individual embed commands. Ref #513 * feat: add command_id to embedding error logs Add get_command_id() helper to extract command_id from execution context. Include command_id in error logs for all embedding commands: - embed_note_command - embed_insight_command - embed_source_command - create_insight_command This makes it easier to trace failed embedding jobs back to specific command records in the database. Ref #513 * fix: improve logging for embedding commands Log improvements: - Add command_id to all embedding error logs for traceability - Transaction conflicts in repo_insert now log at DEBUG (not ERROR) - Embedding API errors log at DEBUG, only ERROR when retries exhausted - Friendlier retry messages: "This will be retried automatically" - Include model name and command_id in generate_embeddings errors Files changed: - commands/embedding_commands.py: command_id in logs, friendlier messages - open_notebook/database/repository.py: DEBUG for transaction conflicts - open_notebook/utils/embedding.py: DEBUG logging, pass-through command_id Ref #513 * fix: correct field names in rebuild embeddings status endpoint The API status endpoint was looking for wrong field names: - sources_processed → sources_submitted - notes_processed → notes_submitted - insights_processed → insights_submitted - processed_items → jobs_submitted - failed_items → failed_submissions The command outputs "_submitted" because embedding happens async (we count jobs submitted, not items processed). 
Ref #513 * fix: update rebuild UI text to reflect async job submission Changed terminology from "Completed/processed" to "Jobs Submitted" since the rebuild command submits embedding jobs for async processing, not completing them synchronously. Updated in all locales: en-US, pt-BR, zh-CN, zh-TW, ja-JP Ref #513 * refactor: migrate retry strategy from allowlist to blocklist - Change from `retry_on: [RuntimeError, ...]` to `stop_on: [ValueError]` - This is more resilient: new exception types auto-retry by default - Simplified exception handling: ValueError = permanent, else = retry - Transient errors logged at DEBUG (surreal-commands logs final failure) - Permanent errors (ValueError) logged at ERROR Ref #513
192 lines
6.9 KiB
Python
192 lines
6.9 KiB
Python
from fastapi import APIRouter, HTTPException
|
|
from loguru import logger
|
|
from surreal_commands import get_command_status
|
|
|
|
from api.command_service import CommandService
|
|
from api.models import (
|
|
RebuildProgress,
|
|
RebuildRequest,
|
|
RebuildResponse,
|
|
RebuildStats,
|
|
RebuildStatusResponse,
|
|
)
|
|
from open_notebook.database.repository import repo_query
|
|
|
|
# Router on which the embedding-rebuild endpoints in this module are registered.
router = APIRouter()
|
|
|
|
|
|
def _extract_count(result) -> int:
    """Return the integer count carried by the first row of a count query.

    Two result shapes are accepted: a row dict with a ``count`` key, or a bare
    integer. Anything else (empty result, missing/None/non-integer count) is
    treated as 0 so that a malformed row cannot break the estimate.
    """
    if not result:
        return 0
    first = result[0]
    if isinstance(first, dict):
        first = first.get("count", 0)
    return first if isinstance(first, int) else 0


async def _estimate_rebuild_total(request: RebuildRequest) -> int:
    """Run one count query per requested item type and return the summed total.

    In "existing" mode only items that already have a non-empty embedding are
    counted; in any other mode all items are counted (sources and notes only
    when their content field is not none).
    """
    # NOTE(review): these counts only check for non-none content/embeddings.
    # If the rebuild command applies stricter filters (e.g. skipping
    # whitespace-only content), the estimate may exceed the number of jobs
    # actually submitted — confirm whether the two should match.
    existing_only = request.mode == "existing"
    queries = []

    if request.include_sources:
        if existing_only:
            # Count distinct sources that have at least one stored embedding
            queries.append(
                """
                SELECT VALUE count(array::distinct(
                    SELECT VALUE source.id
                    FROM source_embedding
                    WHERE embedding != none AND array::len(embedding) > 0
                )) as count FROM {}
                """
            )
        else:
            # Count all sources with content
            queries.append(
                "SELECT VALUE count() as count FROM source WHERE full_text != none GROUP ALL"
            )

    if request.include_notes:
        if existing_only:
            queries.append(
                "SELECT VALUE count() as count FROM note WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
            )
        else:
            queries.append(
                "SELECT VALUE count() as count FROM note WHERE content != none GROUP ALL"
            )

    if request.include_insights:
        if existing_only:
            queries.append(
                "SELECT VALUE count() as count FROM source_insight WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
            )
        else:
            queries.append(
                "SELECT VALUE count() as count FROM source_insight GROUP ALL"
            )

    total = 0
    # Queries run sequentially, in the same order as the include_* flags above.
    for query in queries:
        total += _extract_count(await repo_query(query))
    return total


@router.post("/rebuild", response_model=RebuildResponse)
async def start_rebuild(request: RebuildRequest):
    """
    Start a background job to rebuild embeddings.

    - **mode**: "existing" (re-embed items with embeddings) or "all" (embed everything)
    - **include_sources**: Include sources in rebuild (default: true)
    - **include_notes**: Include notes in rebuild (default: true)
    - **include_insights**: Include insights in rebuild (default: true)

    Returns command ID to track progress and estimated item count.
    """
    try:
        logger.info(f"Starting rebuild request: mode={request.mode}")

        # Import commands to ensure they're registered
        import commands.embedding_commands  # noqa: F401

        # Estimate total items (quick count queries).
        # This is a rough estimate computed before the command runs.
        total_estimate = await _estimate_rebuild_total(request)
        logger.info(f"Estimated {total_estimate} items to process")

        # Submit command
        command_id = await CommandService.submit_command_job(
            "open_notebook",
            "rebuild_embeddings",
            {
                "mode": request.mode,
                "include_sources": request.include_sources,
                "include_notes": request.include_notes,
                "include_insights": request.include_insights,
            },
        )
        logger.info(f"Submitted rebuild command: {command_id}")

        return RebuildResponse(
            command_id=command_id,
            total_items=total_estimate,
            message=f"Rebuild operation started. Estimated {total_estimate} items to process.",
        )

    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500s
        # (same policy as the status endpoint in this module).
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"Failed to start rebuild: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to start rebuild operation: {str(e)}"
        ) from e
|
|
|
|
|
|
@router.get("/rebuild/{command_id}/status", response_model=RebuildStatusResponse)
async def get_rebuild_status(command_id: str):
    """
    Get the status of a rebuild operation.

    Returns:
    - **status**: queued, running, completed, failed
    - **progress**: jobs submitted, total count, percentage
    - **stats**: breakdown by type (sources, notes, insights, failed)
    - **timestamps**: started_at, completed_at
    """
    try:
        # Get command status from surreal_commands
        status = await get_command_status(command_id)

        if not status:
            raise HTTPException(status_code=404, detail="Rebuild command not found")

        # Build response based on status
        response = RebuildStatusResponse(
            command_id=command_id,
            status=status.status,
        )

        # Only a non-empty dict result carries progress/stat metadata.
        result = status.result if isinstance(status.result, dict) else None

        if result:
            # Build progress info. The command reports jobs *submitted*, which
            # is exposed through the "processed" field of the progress model.
            if "total_items" in result and "jobs_submitted" in result:
                # Treat an explicit None as 0 so the arithmetic below cannot
                # raise TypeError.
                total = result["total_items"] or 0
                submitted = result["jobs_submitted"] or 0
                response.progress = RebuildProgress(
                    processed=submitted,
                    total=total,
                    percentage=round((submitted / total * 100) if total > 0 else 0, 2),
                )

            # Build stats
            response.stats = RebuildStats(
                sources=result.get("sources_submitted", 0),
                notes=result.get("notes_submitted", 0),
                insights=result.get("insights_submitted", 0),
                failed=result.get("failed_submissions", 0),
            )

        # Add timestamps
        if hasattr(status, "created") and status.created:
            response.started_at = str(status.created)
        # NOTE(review): completed_at is filled from `updated` for every status,
        # including non-terminal ones — confirm this is what clients expect.
        if hasattr(status, "updated") and status.updated:
            response.completed_at = str(status.updated)

        # Add error message if failed. Prefer the message stored in the result
        # dict, then an `error_message` attribute on the status object if one
        # exists, and finally a generic fallback so a failed command never
        # reports an empty error.
        if status.status == "failed":
            detail = result.get("error_message") if result else None
            response.error_message = (
                detail or getattr(status, "error_message", None) or "Unknown error"
            )

        return response

    except HTTPException:
        # Let the 404 above (and any other deliberate HTTP error) pass through.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"Failed to get rebuild status: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to get rebuild status: {str(e)}"
        ) from e
|