mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-28 19:40:50 +00:00
feat: add environment variables for chunk size configuration (#520)
Some checks are pending
Some checks are pending
Adds OPEN_NOTEBOOK_CHUNK_SIZE and OPEN_NOTEBOOK_CHUNK_OVERLAP environment variables to allow users to configure chunking behavior for different embedding models with varying context window limits.

Key changes:
- CHUNK_SIZE is now configurable via OPEN_NOTEBOOK_CHUNK_SIZE (default: 1200)
- CHUNK_OVERLAP is configurable via OPEN_NOTEBOOK_CHUNK_OVERLAP (default: 15% of CHUNK_SIZE)
- Validation with warnings for invalid or out-of-range values
- Updated documentation with configuration examples

This enables users of models like mxbai-embed-large with limited context windows to reduce chunk size accordingly.

Closes #510
This commit is contained in:
parent
98eb6ed202
commit
4f33b854dd
2 changed files with 95 additions and 6 deletions
|
|
@ -7,8 +7,13 @@ Supports HTML, Markdown, and plain text with appropriate splitters for each type
|
|||
Key functions:
|
||||
- detect_content_type(): Detects content type from file extension or content heuristics
|
||||
- chunk_text(): Splits text into chunks using appropriate splitter for content type
|
||||
|
||||
Environment Variables:
|
||||
OPEN_NOTEBOOK_CHUNK_SIZE: Maximum chunk size in characters (default: 1200)
|
||||
OPEN_NOTEBOOK_CHUNK_OVERLAP: Overlap between chunks in characters (default: 15% of CHUNK_SIZE)
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
|
@ -21,11 +26,71 @@ from langchain_text_splitters import (
|
|||
)
|
||||
from loguru import logger
|
||||
|
||||
# Constants
|
||||
CHUNK_SIZE = 1200 # characters
|
||||
CHUNK_OVERLAP = 180 # 15% of chunk size
|
||||
|
||||
def _get_chunk_size() -> int:
    """Return the chunk size, in characters, configured through the environment.

    Reads ``OPEN_NOTEBOOK_CHUNK_SIZE`` and validates it:

    * unset or empty: the default (1200) is returned without logging;
    * not parseable as an integer: a warning is logged and the default
      (1200) is returned;
    * below 100: a warning is logged and the value is clamped to 100;
    * above 8192: a warning is logged, but the value is still honoured.

    Returns:
        The chunk size to use; never smaller than 100.
    """
    default_size = 1200
    minimum_size = 100
    # Soft limit only: larger values are accepted, but flagged in the log.
    warn_above = 8192

    raw_value = os.getenv("OPEN_NOTEBOOK_CHUNK_SIZE")
    if not raw_value:
        return default_size

    # Only the conversion can raise ValueError, so keep the try body minimal.
    try:
        chunk_size = int(raw_value)
    except ValueError:
        logger.warning(
            f"Invalid OPEN_NOTEBOOK_CHUNK_SIZE value: '{raw_value}'. "
            f"Using default: {default_size}"
        )
        return default_size

    if chunk_size < minimum_size:
        logger.warning(
            f"OPEN_NOTEBOOK_CHUNK_SIZE ({chunk_size}) is too small. "
            f"Using minimum value of {minimum_size}."
        )
        return minimum_size

    if chunk_size > warn_above:
        logger.warning(
            f"OPEN_NOTEBOOK_CHUNK_SIZE ({chunk_size}) is very large. "
            "This may cause issues with some embedding models."
        )

    logger.info(f"Using custom chunk size: {chunk_size} characters")
    return chunk_size
|
||||
|
||||
|
||||
def _get_chunk_overlap(chunk_size: int) -> int:
    """Return the chunk overlap, in characters, configured through the environment.

    Reads ``OPEN_NOTEBOOK_CHUNK_OVERLAP`` and validates it against
    ``chunk_size``:

    * unset or empty: 15% of ``chunk_size`` (truncated to an int) is returned
      without logging;
    * not parseable as an integer: a warning is logged and 15% of
      ``chunk_size`` is returned;
    * negative: a warning is logged and 0 is returned;
    * greater than or equal to ``chunk_size``: a warning is logged and 15% of
      ``chunk_size`` is returned;
    * otherwise the configured value is returned.

    Args:
        chunk_size: The chunk size the overlap must stay strictly below.

    Returns:
        The overlap to use between consecutive chunks.
    """
    # Computed once; every fallback path below shares this value.
    default_overlap = int(chunk_size * 0.15)

    raw_value = os.getenv("OPEN_NOTEBOOK_CHUNK_OVERLAP")
    if not raw_value:
        return default_overlap

    # Only the conversion can raise ValueError, so keep the try body minimal.
    try:
        overlap = int(raw_value)
    except ValueError:
        logger.warning(
            f"Invalid OPEN_NOTEBOOK_CHUNK_OVERLAP value: '{raw_value}'. "
            "Using default: 15% of chunk size"
        )
        return default_overlap

    if overlap < 0:
        logger.warning(
            f"OPEN_NOTEBOOK_CHUNK_OVERLAP ({overlap}) cannot be negative. "
            "Using 0."
        )
        return 0

    if overlap >= chunk_size:
        logger.warning(
            f"OPEN_NOTEBOOK_CHUNK_OVERLAP ({overlap}) cannot be >= chunk size ({chunk_size}). "
            f"Using 15% of chunk size: {default_overlap}"
        )
        return default_overlap

    logger.info(f"Using custom chunk overlap: {overlap} characters")
    return overlap
|
||||
|
||||
|
||||
# Module-level chunking configuration, resolved once at import time from the
# OPEN_NOTEBOOK_CHUNK_SIZE / OPEN_NOTEBOOK_CHUNK_OVERLAP environment variables.
# The overlap is derived after the size because it is validated against it.
CHUNK_SIZE = _get_chunk_size()
CHUNK_OVERLAP = _get_chunk_overlap(CHUNK_SIZE)
HIGH_CONFIDENCE_THRESHOLD = 0.8  # Threshold for heuristics to override extension

logger.debug(
    f"Chunking configuration: CHUNK_SIZE={CHUNK_SIZE}, CHUNK_OVERLAP={CHUNK_OVERLAP}"
)
|
||||
|
||||
|
||||
class ContentType(Enum):
|
||||
"""Content type for chunking strategy selection."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue