open-notebook/open_notebook/utils/text_utils.py
Luis Novo b7e656a319
Version 1 (#160)
New front-end
Launch Chat API
Manage Sources
Enable re-embedding of all contents
Sources can be added without a notebook now
Improved settings
Enable model selector on all chats
Background processing for better experience
Dark mode
Improved Notes

Improved Docs: 
- Remove all Streamlit references from documentation
- Update deployment guides with React frontend setup
- Fix Docker environment variables format (SURREAL_URL, SURREAL_PASSWORD)
- Update docker image tag from :latest to :v1-latest
- Change navigation references (Settings → Models to just Models)
- Update development setup to include frontend npm commands
- Add MIGRATION.md guide for users upgrading from Streamlit
- Update quick-start guide with correct environment variables
- Add port 5055 documentation for API access
- Update project structure to reflect frontend/ directory
- Remove outdated source-chat documentation files
2025-10-18 12:46:22 -03:00

141 lines
No EOL
4.3 KiB
Python

"""
Text utilities for Open Notebook.
Extracted from main utils to avoid circular imports.
"""
import re
import unicodedata
from typing import Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from .token_utils import token_count
# Pattern for matching thinking content in AI responses.
# Captures the text between one <think> ... </think> pair. The quantifier is
# non-greedy so each block ends at the first closing tag, and re.DOTALL lets
# "." match newlines so multi-line thinking is captured as a single group.
THINK_PATTERN = re.compile(r'<think>(.*?)</think>', re.DOTALL)
def split_text(txt: str, chunk_size=500):
    """
    Break a text into overlapping chunks using a recursive character splitter.

    Chunk length is measured with ``token_count`` (passed to the splitter as
    its length function), and consecutive chunks overlap by 15% of
    ``chunk_size`` (truncated to an integer).

    Args:
        txt (str): The text to split.
        chunk_size (int): Maximum chunk length as measured by ``token_count``.
            Default is 500.

    Returns:
        list: The chunks produced by the splitter, in order.
    """
    # Separators are tried in this order; the empty string at the end is the
    # last-resort fallback.
    boundaries = [
        "\n\n",
        "\n",
        ".",
        ",",
        " ",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size * 0.15),
        length_function=token_count,
        separators=boundaries,
    )
    return splitter.split_text(txt)
def remove_non_ascii(text: str) -> str:
    """Return *text* with every character outside the ASCII range (0-127) dropped."""
    # Encoding with errors="ignore" silently discards anything that has no
    # ASCII representation; decoding turns the surviving bytes back into str.
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")
def remove_non_printable(text: str) -> str:
    """
    Normalize whitespace and strip characters outside a small allow-list.

    Despite the name, this removes more than non-printable characters: the
    final step also drops most punctuation and symbols. The steps run in
    this order:

    1. Unicode space variants (U+2000-U+200B, U+202F, U+205F, U+3000) are
       replaced with a regular space.
    2. Each line terminator (CRLF pair, lone CR, U+2028, U+2029) is replaced
       with exactly one newline.
    3. Characters whose Unicode general category starts with "C" are
       removed, except newline and tab.
    4. Non-breaking spaces become regular spaces and the text is stripped of
       leading/trailing whitespace.
    5. Anything that is not a word character, whitespace, or one of
       ``. , ! ? -`` is removed.

    Stripping happens before step 5, so whitespace that only becomes leading
    or trailing once symbols are removed is kept.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Replace any special Unicode whitespace characters with a regular space
    text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)
    # Replace line terminators with a single newline. "\r\n" must be matched
    # as a pair first; replacing "\r" on its own would turn every Windows line
    # ending into two newlines and create spurious paragraph breaks.
    text = re.sub(r"\r\n|[\u2028\u2029\r]", "\n", text)
    # Remove control characters, except newlines and tabs
    text = "".join(
        char for char in text if char in "\n\t" or unicodedata.category(char)[0] != "C"
    )
    # Replace non-breaking spaces with regular spaces
    text = text.replace("\xa0", " ").strip()
    # Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation
    return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)
def parse_thinking_content(content: str) -> Tuple[str, str]:
    """
    Separate <think>...</think> blocks from the rest of a message.

    Args:
        content (str): The original message content.

    Returns:
        Tuple[str, str]: (thinking_content, cleaned_content)
            - thinking_content: The stripped text of every <think> block,
              joined with blank lines. Empty when nothing was extracted.
            - cleaned_content: The message with the <think> blocks removed,
              blank-line runs collapsed and outer whitespace stripped.

        The input is passed through without extraction in three cases:
            - it is not a string (returned as ``str(content)``, or ``""``
              for ``None``);
            - it is longer than 100,000 characters (returned unchanged,
              tags included);
            - it contains no complete <think>...</think> block (returned
              unchanged).

    Example:
        >>> thinking, cleaned = parse_thinking_content(
        ...     "<think>Let me analyze this</think>Here's my answer"
        ... )
        >>> print(thinking)
        Let me analyze this
        >>> print(cleaned)
        Here's my answer
    """
    # Non-string input: nothing to parse, just coerce to a string.
    if not isinstance(content, str):
        fallback = "" if content is None else str(content)
        return "", fallback

    # Skip parsing for very large content (more than 100,000 characters).
    if len(content) > 100000:
        return "", content

    # Collect the stripped inner text of every thinking block.
    blocks = [found.group(1).strip() for found in THINK_PATTERN.finditer(content)]
    if not blocks:
        return "", content

    thinking_content = "\n\n".join(blocks)

    # Drop the blocks, then squeeze the blank-line runs they leave behind.
    remainder = THINK_PATTERN.sub("", content)
    remainder = re.sub(r'\n\s*\n\s*\n', '\n\n', remainder)
    return thinking_content, remainder.strip()
def clean_thinking_content(content: str) -> str:
    """
    Return an AI response with its <think> blocks stripped out.

    Convenience wrapper around ``parse_thinking_content`` for callers that
    only want the visible answer and have no use for the thinking text.

    Args:
        content (str): The original message content, possibly containing
            <think> tags.

    Returns:
        str: The second element of ``parse_thinking_content(content)``,
        i.e. the content with <think> blocks removed and whitespace cleaned.

    Example:
        >>> clean_thinking_content("<think>Let me think...</think>Here's the answer")
        "Here's the answer"
    """
    # Index 1 is the cleaned content; index 0 (the thinking text) is discarded.
    return parse_thinking_content(content)[1]