mirror of
https://github.com/facebookresearch/blt.git
synced 2025-01-18 16:37:46 +00:00
42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
import json
|
|
|
|
from bytelatent.constants import BLT_DATA
|
|
from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
|
|
from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
|
|
|
|
|
|
def test_tokenizer_bytes():
    """Check ``BltTokenizer(bpe_delim=False)`` against stored fixture tokens.

    Loads ``fixtures/tokenizer_data.json`` (resolved relative to the current
    working directory), reads its parallel ``"texts"`` and ``"tokens"`` lists,
    and asserts that encoding each text reproduces the stored token ids.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the locale's preferred encoding, which can mis-decode
    # non-ASCII fixture text on some platforms.
    with open("fixtures/tokenizer_data.json", encoding="utf-8") as f:
        data = json.load(f)

    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # zip() stops at the shorter input, so verify up front that the fixture
    # lists really are parallel instead of silently skipping entries.
    assert len(examples) == len(examples_tokens), (
        "fixture 'texts' and 'tokens' lists differ in length"
    )

    tokenizer = BltTokenizer(bpe_delim=False)
    for i, (text, expected) in enumerate(zip(examples, examples_tokens)):
        assert tokenizer.encode(text) == expected, (
            f"token mismatch for fixture example {i}"
        )
|
|
|
|
|
|
def test_tokenizer_bpe():
    """Check ``BltTokenizer(bpe_delim=True)`` against stored fixture tokens.

    Loads ``fixtures/tokenizer_data_bpe_delim.json`` (resolved relative to the
    current working directory), reads its parallel ``"texts"`` and ``"tokens"``
    lists, and asserts that encoding each text reproduces the stored token ids.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the locale's preferred encoding, which can mis-decode
    # non-ASCII fixture text on some platforms.
    with open("fixtures/tokenizer_data_bpe_delim.json", encoding="utf-8") as f:
        data = json.load(f)

    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # zip() stops at the shorter input, so verify up front that the fixture
    # lists really are parallel instead of silently skipping entries.
    assert len(examples) == len(examples_tokens), (
        "fixture 'texts' and 'tokens' lists differ in length"
    )

    tokenizer = BltTokenizer(bpe_delim=True)
    for i, (text, expected) in enumerate(zip(examples, examples_tokens)):
        assert tokenizer.encode(text) == expected, (
            f"token mismatch for fixture example {i}"
        )
|
|
|
|
|
|
def test_build_tokenizer_from_args():
    """Build a tokenizer via ``TokenizerArgs.build()`` and smoke-test ``encode``.

    The args select the ``"blt"`` tokenizer and pass a ``bpe_tokenizer_path``
    located under ``BLT_DATA``; the only check is that encoding a short string
    yields something other than ``None``.
    """
    bpe_model_path = BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
    init_kwargs = {"bpe_tokenizer_path": bpe_model_path}
    args = TokenizerArgs(name="blt", init_kwargs=init_kwargs)

    built = args.build()
    encoded = built.encode("test text")
    assert encoded is not None
|