mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-29 03:50:04 +00:00
Version 1 (#160)
New front-end Launch Chat API Manage Sources Enable re-embedding of all contents Sources can be added without a notebook now Improved settings Enable model selector on all chats Background processing for better experience Dark mode Improved Notes Improved Docs: - Remove all Streamlit references from documentation - Update deployment guides with React frontend setup - Fix Docker environment variables format (SURREAL_URL, SURREAL_PASSWORD) - Update docker image tag from :latest to :v1-latest - Change navigation references (Settings → Models to just Models) - Update development setup to include frontend npm commands - Add MIGRATION.md guide for users upgrading from Streamlit - Update quick-start guide with correct environment variables - Add port 5055 documentation for API access - Update project structure to reflect frontend/ directory - Remove outdated source-chat documentation files
This commit is contained in:
parent
124d7d110c
commit
b7e656a319
319 changed files with 46747 additions and 7408 deletions
141
open_notebook/utils/text_utils.py
Normal file
141
open_notebook/utils/text_utils.py
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
"""
|
||||
Text utilities for Open Notebook.
|
||||
Extracted from main utils to avoid circular imports.
|
||||
"""
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
from .token_utils import token_count
|
||||
|
||||
# Pre-compiled pattern for <think>...</think> blocks emitted by reasoning
# models; re.DOTALL lets the captured group span multiple lines. Non-greedy
# (.*?) so adjacent blocks are matched separately rather than merged.
THINK_PATTERN = re.compile(r'<think>(.*?)</think>', re.DOTALL)
|
||||
|
||||
|
||||
def split_text(txt: str, chunk_size: int = 500, overlap_ratio: float = 0.15):
    """
    Split the input text into token-sized chunks with proportional overlap.

    Args:
        txt (str): The input text to be split.
        chunk_size (int): The size of each chunk, measured in tokens via
            token_count. Default is 500.
        overlap_ratio (float): Fraction of chunk_size shared between
            consecutive chunks. Default is 0.15, the previously hard-coded
            value, so existing callers are unaffected.

    Returns:
        list: A list of text chunks.
    """
    # Overlap keeps context continuous across chunk boundaries.
    overlap = int(chunk_size * overlap_ratio)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=token_count,
        # Ordered strongest-to-weakest break points; CJK punctuation is
        # included so Chinese/Japanese text splits at sentence boundaries.
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            " ",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
    )
    return text_splitter.split_text(txt)
|
||||
|
||||
|
||||
def remove_non_ascii(text: str) -> str:
    """Drop every character outside the 7-bit ASCII range."""
    return "".join(ch for ch in text if ord(ch) < 0x80)
|
||||
|
||||
|
||||
def remove_non_printable(text: str) -> str:
    """Strip control and exotic characters, normalizing whitespace."""
    # Normalize special Unicode spaces (en/em spaces, zero-width, etc.).
    cleaned = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)

    # Normalize unusual line terminators (LS, PS, CR) to a plain newline.
    cleaned = re.sub(r"[\u2028\u2029\r]", "\n", cleaned)

    # Drop control-category characters, but keep newlines and tabs.
    kept = [
        ch for ch in cleaned
        if ch in "\n\t" or unicodedata.category(ch)[0] != "C"
    ]
    cleaned = "".join(kept)

    # Non-breaking space -> regular space; trim the ends.
    cleaned = cleaned.replace("\xa0", " ").strip()

    # Allow word characters (incl. accented letters), whitespace, and
    # basic punctuation; everything else is removed.
    return re.sub(r"[^\w\s.,!?\-\n\t]", "", cleaned, flags=re.UNICODE)
|
||||
|
||||
|
||||
def parse_thinking_content(content: str) -> Tuple[str, str]:
    """
    Split an AI message into its <think> reasoning and the visible reply.

    Args:
        content (str): Raw message text, possibly containing <think> tags.

    Returns:
        Tuple[str, str]: (thinking_content, cleaned_content)
            - thinking_content: every <think> block, joined with blank lines
            - cleaned_content: the message with those blocks removed

    Example:
        >>> thinking, cleaned = parse_thinking_content(
        ...     "<think>Let me analyze this</think>Here's my answer")
        >>> thinking
        "Let me analyze this"
        >>> cleaned
        "Here's my answer"
    """
    # Guard: non-string input is coerced to text with no thinking part.
    if not isinstance(content, str):
        return ("", "") if content is None else ("", str(content))

    # Guard: skip very large payloads (>100KB) to bound regex work.
    if len(content) > 100000:
        return "", content

    # re.compile caches, so this is equivalent to a module-level pattern.
    pattern = re.compile(r'<think>(.*?)</think>', re.DOTALL)
    blocks = pattern.findall(content)
    if not blocks:
        return "", content

    # Collect every thinking block, blank-line separated.
    thinking = "\n\n".join(block.strip() for block in blocks)

    # Strip the <think>...</think> spans from the visible text.
    remainder = pattern.sub("", content)

    # Collapse runs of 3+ newlines left behind by removed blocks.
    remainder = re.sub(r'\n\s*\n\s*\n', '\n\n', remainder).strip()

    return thinking, remainder
|
||||
|
||||
|
||||
def clean_thinking_content(content: str) -> str:
    """
    Return the message with any <think>...</think> reasoning removed.

    Convenience wrapper around parse_thinking_content for callers that
    only want the user-visible text and not the thinking process.

    Args:
        content (str): Message text, possibly containing <think> tags.

    Returns:
        str: Content with <think> blocks removed and whitespace cleaned.

    Example:
        >>> clean_thinking_content("<think>Let me think...</think>Here's the answer")
        "Here's the answer"
    """
    return parse_thinking_content(content)[1]
|
||||
Loading…
Add table
Add a link
Reference in a new issue