# Mirror of https://github.com/facebookresearch/blt.git
# Synced 2025-01-18 16:37:46 +00:00
# 20 lines, 526 B, Python
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||
|
import abc
|
||
|
|
||
|
|
||
|
class Tokenizer(abc.ABC):
|
||
|
@abc.abstractmethod
|
||
|
def encode(self, text: str, add_bos: bool, add_eos: bool):
|
||
|
pass
|
||
|
|
||
|
@abc.abstractmethod
|
||
|
def decode(self, tokens: list[int]):
|
||
|
pass
|
||
|
|
||
|
@abc.abstractmethod
|
||
|
def get_token_offsets(
|
||
|
self, text: str, tokens: list[int] | None = None
|
||
|
) -> tuple[list[str], list[int]]:
|
||
|
"""Return the offsets of the tokens in the original text. Only used for evaluation."""
|
||
|
pass
|