fix: Fix Notion reindexing and document updates

DESKTOP-RTLN3BA\$punk 2025-03-26 17:19:10 -07:00
parent 52c0a9eefa
commit 23da404177
3 changed files with 87 additions and 38 deletions

.gitignore

@@ -12,3 +12,4 @@ __pycache__
 __pycache__/
 .__pycache__
 surfsense_backend/.env
+.flashrank_cache


@@ -401,18 +401,18 @@ async def run_notion_indexing(
     """
     try:
         # Index Notion pages without updating last_indexed_at (we'll do it separately)
-        documents_indexed, error_or_warning = await index_notion_pages(
+        documents_processed, error_or_warning = await index_notion_pages(
             session=session,
             connector_id=connector_id,
             search_space_id=search_space_id,
             update_last_indexed=False  # Don't update timestamp in the indexing function
         )

-        # Only update last_indexed_at if indexing was successful
-        if documents_indexed > 0 and (error_or_warning is None or "Indexed" in error_or_warning):
+        # Only update last_indexed_at if indexing was successful (either new docs or updated docs)
+        if documents_processed > 0:
             await update_connector_last_indexed(session, connector_id)
-            logger.info(f"Notion indexing completed successfully: {documents_indexed} documents indexed")
+            logger.info(f"Notion indexing completed successfully: {documents_processed} documents processed")
         else:
-            logger.error(f"Notion indexing failed or no documents indexed: {error_or_warning}")
+            logger.error(f"Notion indexing failed or no documents processed: {error_or_warning}")
     except Exception as e:
         logger.error(f"Error in background Notion indexing task: {str(e)}")


@@ -2,6 +2,7 @@ from typing import Optional, List, Dict, Any, Tuple
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.future import select
+from sqlalchemy import delete
 from datetime import datetime, timedelta
 from app.db import Document, DocumentType, Chunk, SearchSourceConnector, SearchSourceConnectorType
 from app.config import config
@@ -62,7 +63,7 @@ async def index_slack_messages(
             # Check if last_indexed_at is today
             today = datetime.now().date()
             if connector.last_indexed_at.date() == today:
-                # If last indexed today, go back 1 day to ensure we don't miss anything
+                # If last indexed today, go back 7 days to ensure we don't miss anything
                 start_date = end_date - timedelta(days=7)
             else:
                 start_date = connector.last_indexed_at
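The Slack look-back rule in this hunk (the comment fix aligns it with the code, which already re-scanned 7 days) is self-contained enough to test on its own. A minimal sketch of the same windowing logic as a pure function — the helper name is hypothetical, the two shown branches mirror the diff, and the 365-day no-history default is borrowed from the Notion path below and assumed to apply here as well:

from datetime import datetime, timedelta


def compute_slack_start_date(last_indexed_at: datetime | None, end_date: datetime) -> datetime:
    if last_indexed_at is None:
        # No previous run recorded: assumed wide default window (as in the Notion path).
        return end_date - timedelta(days=365)
    if last_indexed_at.date() == end_date.date():
        # Already indexed today: re-scan the last 7 days so late edits aren't missed.
        return end_date - timedelta(days=7)
    # Otherwise resume from the last successful run.
    return last_indexed_at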
@@ -293,17 +294,8 @@ async def index_notion_pages(
             # Calculate date range
             end_date = datetime.now()

-            # Use last_indexed_at as start date if available, otherwise use 365 days ago
-            if connector.last_indexed_at:
-                # Check if last_indexed_at is today
-                today = datetime.now().date()
-                if connector.last_indexed_at.date() == today:
-                    # If last indexed today, go back 1 day to ensure we don't miss anything
-                    start_date = end_date - timedelta(days=1)
-                else:
-                    start_date = connector.last_indexed_at
-            else:
-                start_date = end_date - timedelta(days=365)
+            # Check for last 1 year of pages
+            start_date = end_date - timedelta(days=365)

             # Format dates for Notion API (ISO format)
             start_date_str = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
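One detail worth flagging in the date handling above: `datetime.now()` returns a naive local timestamp, yet the format string appends a literal `Z` (UTC). The formatting in isolation — `end_date_str` is assumed to follow the same pattern as the committed `start_date_str`:

from datetime import datetime, timedelta

end_date = datetime.now()          # naive local time
start_date = end_date - timedelta(days=365)

# ISO-8601 strings for the Notion API; the hard-coded "Z" suffix is only
# accurate if the process clock runs in UTC.
start_date_str = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
end_date_str = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")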
@@ -323,8 +315,28 @@ async def index_notion_pages(
                 logger.info("No Notion pages found to index")
                 return 0, "No Notion pages found"

+            # Get existing documents for this search space and connector type to prevent duplicates
+            existing_docs_result = await session.execute(
+                select(Document)
+                .filter(
+                    Document.search_space_id == search_space_id,
+                    Document.document_type == DocumentType.NOTION_CONNECTOR
+                )
+            )
+            existing_docs = existing_docs_result.scalars().all()
+
+            # Create a lookup dictionary of existing documents by page_id
+            existing_docs_by_page_id = {}
+            for doc in existing_docs:
+                if "page_id" in doc.document_metadata:
+                    existing_docs_by_page_id[doc.document_metadata["page_id"]] = doc
+
+            logger.info(f"Found {len(existing_docs_by_page_id)} existing Notion documents in database")
+
             # Track the number of documents indexed
             documents_indexed = 0
+            documents_updated = 0
+            documents_skipped = 0
             skipped_pages = []

             # Process each page
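The lookup dictionary added in this hunk is what turns reindexing into an upsert: each incoming page is matched by the Notion page_id stored in document metadata before anything is written. The same pattern in isolation, with plain dicts standing in for ORM rows (all names and sample data here are illustrative):

existing_docs = [
    {"document_metadata": {"page_id": "abc123", "page_title": "Roadmap"}},
]
notion_pages = [
    {"id": "abc123", "title": "Roadmap"},   # takes the update path
    {"id": "def456", "title": "New page"},  # takes the insert path
]

# Existing documents keyed by the Notion page_id kept in their metadata.
existing_docs_by_page_id = {
    doc["document_metadata"]["page_id"]: doc
    for doc in existing_docs
    if "page_id" in doc["document_metadata"]
}

for page in notion_pages:
    existing = existing_docs_by_page_id.get(page["id"])
    if existing:
        pass  # update path: refresh title, content, embedding, chunks in place
    else:
        pass  # insert path: create and add a new Document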
@@ -339,6 +351,7 @@ async def index_notion_pages(
                     if not page_content:
                         logger.info(f"No content found in page {page_title}. Skipping.")
                         skipped_pages.append(f"{page_title} (no content)")
+                        documents_skipped += 1
                         continue

                     # Convert page content to markdown format
@@ -435,33 +448,66 @@ async def index_notion_pages(
                         for chunk in config.chunker_instance.chunk(markdown_content)
                     ]

-                    # Create and store document
-                    document = Document(
-                        search_space_id=search_space_id,
-                        title=f"Notion - {page_title}",
-                        document_type=DocumentType.NOTION_CONNECTOR,
-                        document_metadata={
-                            "page_title": page_title,
-                            "page_id": page_id,
-                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                        },
-                        content=summary_content,
-                        embedding=summary_embedding,
-                        chunks=chunks
-                    )
-                    session.add(document)
-                    documents_indexed += 1
-                    logger.info(f"Successfully indexed Notion page: {page_title}")
+                    # Check if this page already exists in our database
+                    existing_document = existing_docs_by_page_id.get(page_id)
+
+                    if existing_document:
+                        # Update existing document instead of creating a new one
+                        logger.info(f"Updating existing document for page {page_title}")
+
+                        # Update document fields
+                        existing_document.title = f"Notion - {page_title}"
+                        existing_document.document_metadata = {
+                            "page_title": page_title,
+                            "page_id": page_id,
+                            "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                            "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                        }
+                        existing_document.content = summary_content
+                        existing_document.embedding = summary_embedding
+
+                        # Delete existing chunks and add new ones
+                        await session.execute(
+                            delete(Chunk)
+                            .where(Chunk.document_id == existing_document.id)
+                        )
+
+                        # Assign new chunks to existing document
+                        for chunk in chunks:
+                            chunk.document_id = existing_document.id
+                            session.add(chunk)
+
+                        documents_updated += 1
+                    else:
+                        # Create and store new document
+                        document = Document(
+                            search_space_id=search_space_id,
+                            title=f"Notion - {page_title}",
+                            document_type=DocumentType.NOTION_CONNECTOR,
+                            document_metadata={
+                                "page_title": page_title,
+                                "page_id": page_id,
+                                "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                            },
+                            content=summary_content,
+                            embedding=summary_embedding,
+                            chunks=chunks
+                        )
+                        session.add(document)
+                        documents_indexed += 1
+                        logger.info(f"Successfully indexed new Notion page: {page_title}")

                 except Exception as e:
                     logger.error(f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", exc_info=True)
                     skipped_pages.append(f"{page.get('title', 'Unknown')} (processing error)")
+                    documents_skipped += 1
                     continue  # Skip this page and continue with others

             # Update the last_indexed_at timestamp for the connector only if requested
             # and if we successfully indexed at least one page
-            if update_last_indexed and documents_indexed > 0:
+            total_processed = documents_indexed + documents_updated
+            if update_last_indexed and total_processed > 0:
                 connector.last_indexed_at = datetime.now()
                 logger.info(f"Updated last_indexed_at for connector {connector_id}")
@@ -471,10 +517,12 @@ async def index_notion_pages(
             # Prepare result message
             result_message = None
             if skipped_pages:
-                result_message = f"Indexed {documents_indexed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
+                result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated). Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
+            else:
+                result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated)."

-            logger.info(f"Notion indexing completed: {documents_indexed} pages indexed, {len(skipped_pages)} pages skipped")
-            return documents_indexed, result_message
+            logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_updated} updated, {documents_skipped} skipped")
+            return total_processed, result_message

         except SQLAlchemyError as db_error:
             await session.rollback()