feat: add cascade deletion for notebooks with delete preview (#471)

* feat: decrease chunking size for maximum ollama compatibility

* docs: improve i18n info on Claude.md

* feat: add cascade deletion for notebooks with delete preview

- Add Notebook.get_delete_preview() to show counts of affected items
- Add Notebook.delete(delete_exclusive_sources) for cascade deletion
- Always delete notes when notebook is deleted
- Allow user to choose: delete or keep exclusive sources
- Shared sources are always unlinked but never deleted
- Add NotebookDeleteDialog component with radio button options
- Add delete-preview API endpoint
- Update delete endpoint with delete_exclusive_sources param
- Add i18n support for all 5 locales

Closes #77

* docs: remove hardcoded config settings
This commit is contained in:
Luis Novo 2026-01-25 14:56:14 -03:00 committed by GitHub
parent f14020d385
commit 4e411e0488
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 527 additions and 55 deletions

View file

@ -25,6 +25,8 @@ Two base classes support different persistence patterns: **ObjectModel** (mutabl
### notebook.py
- **Notebook**: Research project container
- `get_sources()`, `get_notes()`, `get_chat_sessions()`: Navigate relationships
- `get_delete_preview()`: Returns counts of notes, exclusive sources, and shared sources that would be affected by deletion
- `delete(delete_exclusive_sources)`: Cascade deletion - always deletes notes, optionally deletes exclusive sources, always unlinks all sources
- **Source**: Content item (file/URL)
- `vectorize()`: Submit async embedding job (returns command_id, fire-and-forget)

View file

@ -85,6 +85,150 @@ class Notebook(ObjectModel):
logger.exception(e)
raise DatabaseOperationError(e)
async def get_delete_preview(self) -> Dict[str, Any]:
    """
    Get counts of items that would be affected by deleting this notebook.

    Read-only: performs count queries but deletes nothing.

    Returns a dict with:
        - note_count: Number of notes that will be deleted
        - exclusive_source_count: Sources only in this notebook (can be deleted)
        - shared_source_count: Sources in other notebooks (will be unlinked only)

    Raises:
        DatabaseOperationError: If any of the underlying queries fail.
    """
    try:
        # Normalize self.id into a record id usable in query parameters.
        notebook_id = ensure_record_id(self.id)

        # Count notes: 'artifact' edges point at this notebook via `out`,
        # so counting those edges yields the number of attached notes.
        note_result = await repo_query(
            "SELECT count() as count FROM artifact WHERE out = $notebook_id GROUP ALL",
            {"notebook_id": notebook_id},
        )
        # GROUP ALL yields a single aggregate row; empty result means zero notes.
        note_count = note_result[0]["count"] if note_result else 0

        # Get sources with count of references to OTHER notebooks
        # If assigned_others = 0, source is exclusive to this notebook
        # If assigned_others > 0, source is shared with other notebooks
        source_counts = await repo_query(
            """
            SELECT
                id,
                count(->reference[WHERE out != $notebook_id].out) as assigned_others
            FROM (SELECT VALUE <-reference.in AS sources FROM $notebook_id)[0]
            """,
            {"notebook_id": notebook_id},
        )

        # Partition the notebook's sources into exclusive vs shared based on
        # how many other notebooks still reference each one.
        exclusive_count = 0
        shared_count = 0
        for src in source_counts:
            if src.get("assigned_others", 0) == 0:
                exclusive_count += 1
            else:
                shared_count += 1

        return {
            "note_count": note_count,
            "exclusive_source_count": exclusive_count,
            "shared_source_count": shared_count,
        }
    except Exception as e:
        # Project convention (see surrounding code): log, then wrap in a
        # domain-specific DatabaseOperationError for callers.
        logger.error(f"Error getting delete preview for notebook {self.id}: {e}")
        logger.exception(e)
        raise DatabaseOperationError(e)
async def delete(self, delete_exclusive_sources: bool = False) -> Dict[str, int]:
    """
    Delete notebook with cascade deletion of notes and optional source deletion.

    Order matters: notes are deleted first, then their artifact edges, then
    sources are handled (deleted or merely counted), then all reference edges
    are removed, and finally the notebook record itself is deleted.

    Args:
        delete_exclusive_sources: If True, also delete sources that belong
            only to this notebook. Default is False. Shared sources (those
            referenced by other notebooks) are only unlinked, never deleted.

    Returns:
        Dict with counts: deleted_notes, deleted_sources, unlinked_sources

    Raises:
        InvalidInputError: If the notebook has no ID (was never persisted).
        DatabaseOperationError: If any deletion step fails.
    """
    if self.id is None:
        raise InvalidInputError("Cannot delete notebook without an ID")
    try:
        notebook_id = ensure_record_id(self.id)
        deleted_notes = 0
        deleted_sources = 0
        unlinked_sources = 0

        # 1. Get and delete all notes linked to this notebook
        notes = await self.get_notes()
        for note in notes:
            await note.delete()
            deleted_notes += 1
        logger.info(f"Deleted {deleted_notes} notes for notebook {self.id}")

        # Delete artifact relationships so no dangling note->notebook
        # edges remain after the notes themselves are gone.
        await repo_query(
            "DELETE artifact WHERE out = $notebook_id",
            {"notebook_id": notebook_id},
        )

        # 2. Handle sources
        if delete_exclusive_sources:
            # Find sources with count of references to OTHER notebooks
            # If assigned_others = 0, source is exclusive to this notebook
            source_counts = await repo_query(
                """
                SELECT
                    id,
                    count(->reference[WHERE out != $notebook_id].out) as assigned_others
                FROM (SELECT VALUE <-reference.in AS sources FROM $notebook_id)[0]
                """,
                {"notebook_id": notebook_id},
            )
            for src in source_counts:
                source_id = src.get("id")
                if source_id and src.get("assigned_others", 0) == 0:
                    # Exclusive source - delete it
                    try:
                        source = await Source.get(str(source_id))
                        await source.delete()
                        deleted_sources += 1
                    except Exception as e:
                        # Best-effort: a single failed source deletion is
                        # logged but does not abort the notebook deletion.
                        # NOTE(review): a failed delete is counted neither as
                        # deleted nor unlinked — confirm that is intended.
                        logger.warning(
                            f"Failed to delete exclusive source {source_id}: {e}"
                        )
                else:
                    # Shared source (or row without an id): leave it in place;
                    # the reference edge is removed below.
                    unlinked_sources += 1
        else:
            # Just count sources that will be unlinked
            source_result = await repo_query(
                "SELECT count() as count FROM reference WHERE out = $notebook_id GROUP ALL",
                {"notebook_id": notebook_id},
            )
            unlinked_sources = source_result[0]["count"] if source_result else 0

        # Delete reference relationships (unlink all sources)
        await repo_query(
            "DELETE reference WHERE out = $notebook_id",
            {"notebook_id": notebook_id},
        )
        logger.info(
            f"Unlinked {unlinked_sources} sources, deleted {deleted_sources} "
            f"exclusive sources for notebook {self.id}"
        )

        # 3. Delete the notebook record itself
        await super().delete()
        logger.info(f"Deleted notebook {self.id}")

        return {
            "deleted_notes": deleted_notes,
            "deleted_sources": deleted_sources,
            "unlinked_sources": unlinked_sources,
        }
    except Exception as e:
        # Wrap in the project's domain error so API callers see a single
        # failure type for database problems.
        logger.error(f"Error deleting notebook {self.id}: {e}")
        logger.exception(e)
        raise DatabaseOperationError(f"Failed to delete notebook: {e}")
class Asset(BaseModel):
file_path: Optional[str] = None

View file

@ -39,8 +39,8 @@ Each utility is stateless and can be imported independently.
### chunking.py
- **ContentType**: Enum (HTML, MARKDOWN, PLAIN)
- **CHUNK_SIZE**: 1500 characters (constant)
- **CHUNK_OVERLAP**: 225 characters (15% overlap)
- **CHUNK_SIZE**: constant
- **CHUNK_OVERLAP**: constant
- **detect_content_type_from_extension(file_path)**: Detect type from file extension
- **detect_content_type_from_heuristics(text)**: Detect type from content patterns (returns type + confidence)
- **detect_content_type(text, file_path)**: Combined detection (extension primary, heuristics fallback)

View file

@ -22,8 +22,8 @@ from langchain_text_splitters import (
from loguru import logger
# Constants
CHUNK_SIZE = 1500 # characters
CHUNK_OVERLAP = 225 # 15% of chunk size
CHUNK_SIZE = 1200 # characters
CHUNK_OVERLAP = 180 # 15% of chunk size
HIGH_CONFIDENCE_THRESHOLD = 0.8 # Threshold for heuristics to override extension
@ -73,7 +73,9 @@ _EXTENSION_TO_CONTENT_TYPE = {
}
def detect_content_type_from_extension(file_path: Optional[str]) -> Optional[ContentType]:
def detect_content_type_from_extension(
file_path: Optional[str],
) -> Optional[ContentType]:
"""
Detect content type from file extension.
@ -220,9 +222,7 @@ def _calculate_markdown_score(text: str) -> float:
return min(score, 1.0)
def detect_content_type(
text: str, file_path: Optional[str] = None
) -> ContentType:
def detect_content_type(text: str, file_path: Optional[str] = None) -> ContentType:
"""
Detect content type using file extension (primary) and heuristics (fallback).
@ -352,12 +352,18 @@ def chunk_text(
splitter = _get_html_splitter()
# HTML splitter returns Document objects
docs = splitter.split_text(text)
chunks = [doc.page_content if hasattr(doc, "page_content") else str(doc) for doc in docs]
chunks = [
doc.page_content if hasattr(doc, "page_content") else str(doc)
for doc in docs
]
elif content_type == ContentType.MARKDOWN:
splitter = _get_markdown_splitter()
# Markdown splitter returns Document objects
docs = splitter.split_text(text)
chunks = [doc.page_content if hasattr(doc, "page_content") else str(doc) for doc in docs]
chunks = [
doc.page_content if hasattr(doc, "page_content") else str(doc)
for doc in docs
]
else:
# Plain text - use recursive splitter directly
splitter = _get_plain_splitter()