mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-29 03:50:04 +00:00
This commit addresses two related issues in the chat interface: 1. **Fix broken reference links (OSS-310)** - Completely rewrote convertReferencesToMarkdownLinks() with greedy pattern matching - Now handles all edge cases: references after commas, nested brackets, bold markdown - Added visual icon indicators (FileText, Lightbulb, FileEdit) for reference types - Implemented proper error handling with toast notifications - Added validation for reference types and ID lengths 2. **Fix long URL/text overflow (#172)** - Added break-words and overflow-wrap classes to chat messages - Long URLs and text now wrap properly within chat bubbles - Applied fix consistently across source chat, notebook chat, and search results **Technical Details:** - Enhanced reference detection algorithm processes from end to start to preserve indices - Context analysis (50 chars before/after) determines original formatting - Icons are 12px, accessible, and themed appropriately - All changes pass linting and build successfully **Files Modified:** - frontend/src/lib/utils/source-references.tsx (core algorithm rewrite) - frontend/src/components/source/ChatPanel.tsx (error handling + text wrapping) - frontend/src/components/search/StreamingResponse.tsx (error handling + text wrapping) - open_notebook/utils/token_utils.py (ruff formatting fix) fixes #172
46 lines
No EOL
1.4 KiB
Python
46 lines
No EOL
1.4 KiB
Python
"""
|
|
Token utilities for Open Notebook.
|
|
Handles token counting and cost calculations for language models.
|
|
"""
|
|
|
|
import os
|
|
|
|
from open_notebook.config import TIKTOKEN_CACHE_DIR
|
|
|
|
# Set tiktoken cache directory before importing tiktoken to ensure
|
|
# tokenizer encodings are cached persistently in the data folder
|
|
os.environ["TIKTOKEN_CACHE_DIR"] = TIKTOKEN_CACHE_DIR
|
|
|
|
|
|
def token_count(input_string: str) -> int:
    """
    Count the number of tokens in the input string using the 'o200k_base' encoding.

    Falls back to a rough word-count estimate when tiktoken is not installed.

    Args:
        input_string (str): The input string to count tokens for.

    Returns:
        int: The number of tokens in the input string.
    """
    try:
        import tiktoken
    except ImportError:
        # tiktoken unavailable: approximate with the common ~1.3
        # tokens-per-word heuristic instead of failing outright.
        word_total = len(input_string.split())
        return int(word_total * 1.3)

    # Import succeeded (env var for the cache dir is set at module load),
    # so count real tokens with the o200k_base tokenizer.
    tokenizer = tiktoken.get_encoding("o200k_base")
    return len(tokenizer.encode(input_string))
|
|
|
|
|
|
def token_cost(token_count: int, cost_per_million: float = 0.150) -> float:
    """
    Calculate the cost of tokens based on the token count and cost per million tokens.

    Args:
        token_count (int): The number of tokens.
        cost_per_million (float): The cost per million tokens. Default is 0.150.

    Returns:
        float: The calculated cost for the given token count.
    """
    # Pricing is quoted per million tokens, so scale the count down first.
    millions = token_count / 1_000_000
    return millions * cost_per_million