blt/bytelatent/tokenizers/abstract_tokenizer.py

20 lines
526 B
Python
Raw Normal View History

2024-12-12 23:32:30 +00:00
# Copyright (c) Meta Platforms, Inc. and affiliates.
import abc
class Tokenizer(abc.ABC):
@abc.abstractmethod
def encode(self, text: str, add_bos: bool, add_eos: bool):
pass
@abc.abstractmethod
def decode(self, tokens: list[int]):
pass
@abc.abstractmethod
def get_token_offsets(
self, text: str, tokens: list[int] | None = None
) -> tuple[list[str], list[int]]:
"""Return the offsets of the tokens in the original text. Only used for evaluation."""
pass