blt/bytelatent/tokenizers/test_blt_tokenizer.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
import json

from bytelatent.constants import BLT_DATA
from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
from bytelatent.tokenizers.build_tokenizer import TokenizerArgs


def test_tokenizer_bytes():
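    """Encoding with bpe_delim=False should reproduce the token ids stored in the fixture."""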
with open("fixtures/tokenizer_data.json") as f:
data = json.load(f)
examples: list[str] = data["texts"]
examples_tokens: list[list[int]] = data["tokens"]
tokenizer = BltTokenizer(bpe_delim=False)
for i in range(len(examples)):
assert tokenizer.encode(examples[i]) == examples_tokens[i]
def test_tokenizer_bpe():
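    """Encoding with bpe_delim=True should match the token ids in the BPE-delimited fixture."""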
with open("fixtures/tokenizer_data_bpe_delim.json") as f:
data = json.load(f)
examples: list[str] = data["texts"]
examples_tokens: list[list[int]] = data["tokens"]
tokenizer = BltTokenizer(bpe_delim=True)
for i in range(len(examples)):
assert tokenizer.encode(examples[i]) == examples_tokens[i]
def test_build_tokenizer_from_args():
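    """Building a tokenizer through TokenizerArgs should yield an object whose encode() works."""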
    tokenizer_args = TokenizerArgs(
        name="blt",
        init_kwargs={
            "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
        },
    )
    tokenizer = tokenizer_args.build()
    assert tokenizer.encode("test text") is not None