Mirror of https://github.com/lfnovo/open-notebook.git, synced 2026-04-29 12:00:00 +00:00.
In air-gapped / offline Docker deployments, tiktoken.get_encoding() tries
to download the encoding file from openaipublic.blob.core.windows.net.
When that request fails it raises a URLError / OSError — not an ImportError
— so the previous except clause silently missed it and the crash surfaced in
the UI.
Widened `except ImportError` to `except Exception` so all failures —
"not installed" and "network unreachable" — fall through to the word-count
fallback (words × 1.3). Added a loguru WARNING so operators can see when
the fallback is active.
TIKTOKEN_CACHE_DIR now reads from the environment with a blank-safe
fallback (`or` guard prevents os.makedirs("") on empty env var). This lets
Docker images redirect the cache to a path outside /app/data/ so user-data
volume mounts cannot shadow the pre-baked encoding.
Both images now pre-download the o200k_base encoding during the builder
stage (internet is available at build time) and copy it into the runtime
image at /app/tiktoken-cache. ENV TIKTOKEN_CACHE_DIR=/app/tiktoken-cache
is set in the runtime stage so no network call is ever needed at runtime.
Added test_token_count_network_error_fallback in tests/test_utils.py:
patches tiktoken.get_encoding with a URLError and asserts token_count()
returns a positive int instead of raising.
Fixes #264
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
"""
Token utilities for Open Notebook.

Handles token counting and cost calculations for language models.
"""

import os

from open_notebook.config import TIKTOKEN_CACHE_DIR

# Set tiktoken cache directory before importing tiktoken to ensure
# tokenizer encodings are cached persistently in the data folder.
# tiktoken reads TIKTOKEN_CACHE_DIR when an encoding is first loaded, so this
# assignment must run before any `import tiktoken` elsewhere in the process —
# do not move this import-order-sensitive line.
# NOTE(review): per the commit message, config's TIKTOKEN_CACHE_DIR is
# blank-safe (guards against an empty env var) — confirm in config.py.
os.environ["TIKTOKEN_CACHE_DIR"] = TIKTOKEN_CACHE_DIR
|
|
|
|
|
|
def token_count(input_string: str) -> int:
    """
    Count the number of tokens in the input string using the 'o200k_base' encoding.

    Falls back to a word-count estimate (words * 1.3) when tiktoken is
    unavailable — either not installed (ImportError) or unable to download
    the encoding file in offline/air-gapped deployments (URLError/OSError).

    Args:
        input_string (str): The input string to count tokens for.

    Returns:
        int: The number of tokens in the input string (exact when tiktoken
        works, estimated otherwise).
    """
    try:
        import tiktoken

        encoding = tiktoken.get_encoding("o200k_base")
        return len(encoding.encode(input_string))
    except Exception as exc:
        # Fallback: handles ImportError (not installed) AND network errors
        # (e.g., offline environments that can't download encoding from internet).
        # BUG FIX: the loguru import itself must not be able to break the
        # fallback — if the environment is broken enough that tiktoken is
        # missing, loguru may be missing too. Degrade to stdlib logging.
        message = (
            f"tiktoken unavailable ({exc}); "
            "falling back to word-count estimation."
        )
        try:
            from loguru import logger

            logger.warning(message)
        except ImportError:
            import logging

            logging.getLogger(__name__).warning(message)
        # Rough heuristic: English text averages ~1.3 tokens per word.
        return int(len(input_string.split()) * 1.3)
|
|
|
|
|
|
def token_cost(token_count: int, cost_per_million: float = 0.150) -> float:
    """
    Calculate the cost of tokens based on the token count and cost per million tokens.

    Args:
        token_count (int): The number of tokens.
        cost_per_million (float): The cost per million tokens. Default is 0.150.

    Returns:
        float: The calculated cost for the given token count.
    """
    # NOTE: the parameter shadows the module-level token_count() function; the
    # name is kept because callers may pass it as a keyword argument.
    millions_of_tokens = token_count / 1_000_000
    return millions_of_tokens * cost_per_million
|