open-notebook/tests/test_chunking.py
Luis Novo d8006ff5cb
feat: content-type aware chunking and unified embedding (#444)
* feat: content-type aware chunking and unified embedding

- Add chunking.py with HTML, Markdown, and plain text detection
- Add embedding.py with mean pooling for large content
- Create dedicated commands: embed_note, embed_insight, embed_source
- Use fire-and-forget pattern for embedding via submit_command()
- Refactor rebuild_embeddings_command to delegate to individual commands
- Remove legacy commands and needs_embedding() methods
- Reduce chunk size to 1500 chars for Ollama compatibility
- Update CLAUDE.md documentation for new architecture

Fixes #350, #142

* fix: address code review issues

- Note.save() now returns command_id for tracking embedding jobs
- Add length check after generate_embeddings() to fail fast on mismatch
- Add numpy as explicit dependency (was transitive)
- Remove hardcoded chunk sizes from docstrings

* docs: address code review comments

- Rename "SYNC PATH" to "DOMAIN MODEL PATH" in embedding router
- Add test_chunking.py and test_embedding.py to Testing Strategy
- Clarify auto-embedding behavior for each domain model

* fix: clean thinking tags from prompt graph output

Adds clean_thinking_content() to prompt.py to handle extended thinking
models that return <think>...</think> tags. This fixes empty titles
when saving notes from chat.

* chore: remove local docker-compose from git

* fix(frontend): handle null parent_id in search results

Add defensive check for null parent_id in search results to prevent
"Cannot read properties of null (reading 'split')" error. This can
happen with orphaned records in the database.

* fix: cascade delete embeddings and insights when source is deleted

When deleting a Source, now also deletes associated:
- source_embedding records
- source_insight records

This prevents orphaned records that cause null parent_id errors
in vector search results.

* fix: add cleanup for orphan embedding/insight records in migration 10

Deletes source_embedding and source_insight records where the
linked source no longer exists (source.id = NONE).

* chore: bump esperanto to 2.16

Increases ctx_num for Ollama models to accommodate larger notebook
context windows. See: https://github.com/lfnovo/esperanto/pull/69
2026-01-21 23:49:08 -03:00

297 lines
11 KiB
Python

"""
Unit tests for the open_notebook.utils.chunking module.
Tests content type detection and text chunking functionality.
"""
import pytest
from open_notebook.utils.chunking import (
CHUNK_SIZE,
ContentType,
chunk_text,
detect_content_type,
detect_content_type_from_extension,
detect_content_type_from_heuristics,
)
# ============================================================================
# TEST SUITE 1: Content Type Detection from Extension
# ============================================================================
class TestDetectContentTypeFromExtension:
    """Checks the content type resolved from a file name's extension."""

    def test_html_extensions(self):
        """HTML-family suffixes (including an upper-case one) map to HTML."""
        for name in ("file.html", "file.htm", "file.xhtml", "/path/to/file.HTML"):
            assert detect_content_type_from_extension(name) == ContentType.HTML

    def test_markdown_extensions(self):
        """Markdown suffixes (including an upper-case one) map to MARKDOWN."""
        for name in ("file.md", "file.markdown", "file.mdown", "/path/to/README.MD"):
            assert detect_content_type_from_extension(name) == ContentType.MARKDOWN

    def test_plain_text_extensions(self):
        """Text suffixes map to PLAIN."""
        for name in ("file.txt", "file.text"):
            assert detect_content_type_from_extension(name) == ContentType.PLAIN

    def test_code_extensions_as_plain(self):
        """Source-code and data-file suffixes are reported as PLAIN."""
        for name in ("file.py", "file.js", "file.json", "file.yaml"):
            assert detect_content_type_from_extension(name) == ContentType.PLAIN

    def test_unknown_extensions(self):
        """Suffixes the detector does not recognise yield None."""
        for name in ("file.xyz", "file.docx", "file.pdf"):
            assert detect_content_type_from_extension(name) is None

    def test_no_extension(self):
        """Names without any suffix yield None."""
        for name in ("Makefile", "README"):
            assert detect_content_type_from_extension(name) is None

    def test_none_input(self):
        """A None path yields None."""
        result = detect_content_type_from_extension(None)
        assert result is None

    def test_empty_string(self):
        """An empty path yields None."""
        result = detect_content_type_from_extension("")
        assert result is None
# ============================================================================
# TEST SUITE 2: Content Type Detection from Heuristics
# ============================================================================
class TestDetectContentTypeFromHeuristics:
    """Test suite for heuristics-based content type detection.

    Every test unpacks the result of ``detect_content_type_from_heuristics``
    as a ``(content_type, confidence)`` pair. Tests that only care about the
    detected type discard the confidence with ``_``.
    """

    def test_html_detection_doctype(self):
        """Test HTML detection with DOCTYPE (confidence of at least 0.8)."""
        html_text = "<!DOCTYPE html><html><body>Content</body></html>"
        content_type, confidence = detect_content_type_from_heuristics(html_text)
        assert content_type == ContentType.HTML
        assert confidence >= 0.8

    def test_html_detection_tags(self):
        """Test HTML detection with structural tags and no DOCTYPE."""
        html_text = "<html><head><title>Test</title></head><body><div><p>Content</p></div></body></html>"
        content_type, confidence = detect_content_type_from_heuristics(html_text)
        assert content_type == ContentType.HTML
        assert confidence >= 0.5

    def test_markdown_detection_headers(self):
        """Test Markdown detection with headers."""
        md_text = """# Main Title
## Section 1
Some content here.
## Section 2
More content.
### Subsection
Details here.
"""
        content_type, confidence = detect_content_type_from_heuristics(md_text)
        assert content_type == ContentType.MARKDOWN
        assert confidence >= 0.3  # 4 headers give ~0.35 confidence

    def test_markdown_detection_links(self):
        """Test Markdown detection with links and headers for stronger signal."""
        md_text = """# Documentation
Check out [this link](https://example.com) and [another one](https://test.com).
## References
Here's some more text with [links](url) and `inline code`."""
        content_type, confidence = detect_content_type_from_heuristics(md_text)
        assert content_type == ContentType.MARKDOWN
        assert confidence >= 0.4

    def test_markdown_detection_code_blocks(self):
        """Test Markdown detection with a fenced code block."""
        md_text = """# Code Example
```python
def hello():
print("Hello, World!")
```
Some explanation text.
"""
        content_type, confidence = detect_content_type_from_heuristics(md_text)
        assert content_type == ContentType.MARKDOWN
        assert confidence >= 0.5

    def test_plain_text_detection(self):
        """Test plain text detection."""
        plain_text = """This is just regular plain text.
It has multiple lines but no special formatting.
No headers, no links, no HTML tags.
Just regular sentences and paragraphs."""
        # Only the detected type matters here; the confidence is not checked.
        content_type, _ = detect_content_type_from_heuristics(plain_text)
        assert content_type == ContentType.PLAIN

    def test_short_text(self):
        """Test short text defaults to plain."""
        content_type, _ = detect_content_type_from_heuristics("Hi")
        assert content_type == ContentType.PLAIN

    def test_empty_text(self):
        """Test empty text defaults to plain."""
        content_type, _ = detect_content_type_from_heuristics("")
        assert content_type == ContentType.PLAIN
# ============================================================================
# TEST SUITE 3: Combined Content Type Detection
# ============================================================================
class TestDetectContentType:
    """Exercises detect_content_type, which receives both text and a file path."""

    def test_extension_takes_priority(self):
        """Markdown-looking text in a .txt file resolves to PLAIN or MARKDOWN.

        NOTE(review): both outcomes are accepted, so this test does not pin
        down whether the extension or the heuristics win — confirm the
        intended precedence and tighten the assertion if possible.
        """
        markdown_like = "# Header\n\nSome [link](url) content"
        accepted = (ContentType.PLAIN, ContentType.MARKDOWN)
        detected = detect_content_type(markdown_like, "file.txt")
        assert detected in accepted

    def test_no_extension_uses_heuristics(self):
        """With no file path, a DOCTYPE document is detected as HTML."""
        document = "<!DOCTYPE html><html><body>Test</body></html>"
        assert detect_content_type(document, None) == ContentType.HTML

    def test_extension_html(self):
        """Unremarkable text with a .html path is reported as HTML."""
        assert detect_content_type("some text", "file.html") == ContentType.HTML

    def test_extension_markdown(self):
        """Unremarkable text with a .md path is reported as MARKDOWN."""
        assert detect_content_type("some text", "file.md") == ContentType.MARKDOWN

    def test_high_confidence_override(self):
        """Strongly HTML-looking content in a .txt file is reported as HTML."""
        document = "<!DOCTYPE html><html><head><title>Test</title></head><body><div><p>Content</p></div></body></html>"
        # The path says plain text, yet the expected result is HTML.
        detected = detect_content_type(document, "file.txt")
        assert detected == ContentType.HTML
# ============================================================================
# TEST SUITE 4: Text Chunking
# ============================================================================
class TestChunkText:
    """Test suite for text chunking functionality.

    ``chunk_text`` is called with the text alone, with an explicit
    ``content_type``, or with a ``file_path``; it returns a list of chunks.
    """

    def test_empty_text(self):
        """Test chunking empty or whitespace-only text yields no chunks."""
        assert chunk_text("") == []
        assert chunk_text(" ") == []

    def test_short_text_no_chunking(self):
        """Test that short text is returned as a single, unchanged chunk."""
        text = "This is a short text."
        chunks = chunk_text(text)
        assert len(chunks) == 1
        assert chunks[0] == text

    def test_text_at_chunk_limit(self):
        """Test text at exactly chunk size limit stays in one chunk."""
        text = "x" * CHUNK_SIZE
        chunks = chunk_text(text)
        assert len(chunks) == 1

    def test_long_text_is_chunked(self):
        """Test that long text is chunked."""
        # Create text longer than chunk size
        text = "This is a sentence. " * 200  # ~4000 chars
        chunks = chunk_text(text)
        assert len(chunks) > 1
        # Each chunk should stay close to CHUNK_SIZE; a 100-char tolerance
        # is allowed for overlap.
        for chunk in chunks:
            assert len(chunk) <= CHUNK_SIZE + 100

    def test_explicit_content_type_html(self):
        """Test chunking with explicit HTML content type."""
        html_text = """<html>
<body>
<h1>Main Title</h1>
<p>First paragraph with lots of content.</p>
<h2>Section</h2>
<p>Second paragraph.</p>
</body>
</html>"""
        chunks = chunk_text(html_text, content_type=ContentType.HTML)
        assert len(chunks) >= 1

    def test_explicit_content_type_markdown(self):
        """Test chunking with explicit Markdown content type."""
        md_text = """# Main Title
Introduction paragraph.
## Section 1
Content for section 1.
## Section 2
Content for section 2.
"""
        chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN)
        assert len(chunks) >= 1

    def test_explicit_content_type_plain(self):
        """Test chunking with explicit plain content type."""
        plain_text = "Word " * 500  # ~2500 chars
        chunks = chunk_text(plain_text, content_type=ContentType.PLAIN)
        assert len(chunks) >= 1

    def test_file_path_detection(self):
        """Test chunking with file path for content type detection."""
        text = "Some content here"
        chunks = chunk_text(text, file_path="document.md")
        assert len(chunks) == 1

    def test_secondary_chunking_for_large_sections(self):
        """Test that large sections from HTML/MD splitters are further chunked."""
        # A single section with no internal separators, sized relative to
        # CHUNK_SIZE so it always exceeds the limit.
        large_section = "x" * (2 * CHUNK_SIZE)
        md_text = f"# Title\n\n{large_section}"
        chunks = chunk_text(md_text, content_type=ContentType.MARKDOWN)
        # The oversized section must be split: one chunk cannot hold it
        # within the size bound asserted below.
        assert len(chunks) > 1
        for chunk in chunks:
            # Allow some flexibility but chunks should be reasonable size
            assert len(chunk) <= CHUNK_SIZE + 300
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports failure to the shell when any test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))