open-notebook/open_notebook/utils/token_utils.py
orihatav 5a350a7622 fix: narrow exception to (ImportError, OSError) and include error in log
Broad 'except Exception' could silently swallow unexpected failures.
URLError and ConnectionError are both subclasses of OSError, so
'except (ImportError, OSError)' captures all real offline/not-installed
cases while letting genuine programming errors propagate.

Also include the exception detail in the warning message so failures
are diagnosable in logs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 19:45:14 -05:00

54 lines
1.7 KiB
Python

"""
Token utilities for Open Notebook.
Handles token counting and cost calculations for language models.
"""
import os
from open_notebook.config import TIKTOKEN_CACHE_DIR
# Point tiktoken's on-disk cache at the directory configured in
# open_notebook.config. This assignment must run before tiktoken is imported
# (token_count imports it lazily, on first call) so that tokenizer encodings
# are cached persistently in the data folder rather than fetched again.
os.environ["TIKTOKEN_CACHE_DIR"] = TIKTOKEN_CACHE_DIR
def token_count(input_string: str) -> int:
    """
    Count the number of tokens in the input string using the 'o200k_base' encoding.

    Special-token markers that appear literally in the text (for example
    ``<|endoftext|>``) are tokenized as ordinary text instead of being
    rejected, so arbitrary user content can be counted without raising
    ``ValueError``.

    If tiktoken is not installed, or its encoding data cannot be loaded
    (for example when offline with no cached copy), a warning is logged and
    the count is estimated as 1.3 tokens per whitespace-separated word.

    Args:
        input_string (str): The input string to count tokens for.

    Returns:
        int: The number of tokens in the input string, or the word-based
            estimate when tiktoken is unavailable.
    """
    try:
        # Imported lazily so that a missing tiktoken installation degrades to
        # the estimate below instead of breaking import of this module.
        import tiktoken

        encoding = tiktoken.get_encoding("o200k_base")
        # disallowed_special=() disables tiktoken's default check, which
        # raises ValueError when the text contains a special-token string.
        tokens = encoding.encode(input_string, disallowed_special=())
        return len(tokens)
    except (ImportError, OSError) as e:
        # Fallback: handles ImportError (tiktoken not installed) AND network/OS
        # errors such as urllib.error.URLError or ConnectionError raised in
        # offline environments when the encoding file cannot be downloaded.
        from loguru import logger

        logger.warning(
            "tiktoken unavailable, falling back to word-count estimation: {}", e
        )
        # Rough heuristic: about 1.3 tokens per whitespace-separated word.
        return int(len(input_string.split()) * 1.3)
def token_cost(token_count: int, cost_per_million: float = 0.150) -> float:
    """
    Convert a token count into a cost, given a price per million tokens.

    Args:
        token_count (int): How many tokens were used.
        cost_per_million (float): Price charged per one million tokens.
            Defaults to 0.150.

    Returns:
        float: The price of ``token_count`` tokens at the given rate.
    """
    # Express the count as a fraction of one million, then scale by the rate.
    millions_of_tokens = token_count / 1_000_000
    return cost_per_million * millions_of_tokens