mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-29 03:50:04 +00:00
* feat: content-type aware chunking and unified embedding - Add chunking.py with HTML, Markdown, and plain text detection - Add embedding.py with mean pooling for large content - Create dedicated commands: embed_note, embed_insight, embed_source - Use fire-and-forget pattern for embedding via submit_command() - Refactor rebuild_embeddings_command to delegate to individual commands - Remove legacy commands and needs_embedding() methods - Reduce chunk size to 1500 chars for Ollama compatibility - Update CLAUDE.md documentation for new architecture Fixes #350, #142 * fix: address code review issues - Note.save() now returns command_id for tracking embedding jobs - Add length check after generate_embeddings() to fail fast on mismatch - Add numpy as explicit dependency (was transitive) - Remove hardcoded chunk sizes from docstrings * docs: address code review comments - Rename "SYNC PATH" to "DOMAIN MODEL PATH" in embedding router - Add test_chunking.py and test_embedding.py to Testing Strategy - Clarify auto-embedding behavior for each domain model * fix: clean thinking tags from prompt graph output Adds clean_thinking_content() to prompt.py to handle extended thinking models that return <think>...</think> tags. This fixes empty titles when saving notes from chat. * chore: remove local docker-compose from git * fix(frontend): handle null parent_id in search results Add defensive check for null parent_id in search results to prevent "Cannot read properties of null (reading 'split')" error. This can happen with orphaned records in the database. * fix: cascade delete embeddings and insights when source is deleted When deleting a Source, now also deletes associated: - source_embedding records - source_insight records This prevents orphaned records that cause null parent_id errors in vector search results. 
* fix: add cleanup for orphan embedding/insight records in migration 10 Deletes source_embedding and source_insight records where the linked source no longer exists (source.id = NONE). * chore: bump esperanto to 2.16 Increases ctx_num for Ollama models to accommodate larger notebook context windows. See: https://github.com/lfnovo/esperanto/pull/69
61 lines
1.5 KiB
Python
61 lines
1.5 KiB
Python
"""
Utils package for Open Notebook.

To avoid circular imports, import functions directly:
- from open_notebook.utils.context_builder import ContextBuilder
- from open_notebook.utils import token_count, compare_versions
- from open_notebook.utils.chunking import chunk_text, detect_content_type, ContentType
- from open_notebook.utils.embedding import generate_embedding, generate_embeddings
"""

# NOTE(review): context_builder is not re-exported below; presumably this is
# the circular-import case the docstring refers to -- confirm before adding it.

# Re-exports: every name imported here is also listed in __all__ so that the
# package's public surface is declared in one place.
from .chunking import (
    CHUNK_SIZE,
    ContentType,
    chunk_text,
    detect_content_type,
    detect_content_type_from_extension,
    detect_content_type_from_heuristics,
)
from .embedding import (
    generate_embedding,
    generate_embeddings,
    mean_pool_embeddings,
)
from .text_utils import (
    clean_thinking_content,
    parse_thinking_content,
    remove_non_ascii,
    remove_non_printable,
)
from .token_utils import token_cost, token_count
from .version_utils import (
    compare_versions,
    get_installed_version,
    get_version_from_github,
)

__all__ = [
    # Chunking
    "CHUNK_SIZE",
    "ContentType",
    "chunk_text",
    "detect_content_type",
    "detect_content_type_from_extension",
    "detect_content_type_from_heuristics",
    # Embedding
    "generate_embedding",
    "generate_embeddings",
    "mean_pool_embeddings",
    # Text utils
    "remove_non_ascii",
    "remove_non_printable",
    "parse_thinking_content",
    "clean_thinking_content",
    # Token utils
    "token_count",
    "token_cost",
    # Version utils
    "compare_versions",
    "get_installed_version",
    "get_version_from_github",
]