mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 11:16:08 +00:00
vocab : fix HybridDNA tokenizer (#23466)
Some checks are pending
Python Type-Check / python type-check (push) Waiting to run
Some checks are pending
Python Type-Check / python type-check (push) Waiting to run
* vocab : mark hybriddna k-mers to avoid BPE token collisions
* improved loop

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
bbce619adb
commit
afcda09d15
2 changed files with 35 additions and 20 deletions
|
|
@ -1617,6 +1617,11 @@ class TextModel(ModelBase):
|
|||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
|
|
|
|||
|
|
@ -1581,6 +1581,11 @@ private:
|
|||
const llm_tokenizer_plamo2 & tokenizer;
|
||||
};
|
||||
|
||||
// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
|
||||
// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
|
||||
// text at load
|
||||
static const std::string dna_kmer_marker = "\xee\x80\x80";
|
||||
|
||||
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
|
|
@ -1636,34 +1641,22 @@ private:
|
|||
c = char(c - 32);
|
||||
}
|
||||
}
|
||||
auto is_valid_kmer = [](const std::string & s) {
|
||||
for (char c : s) {
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
|
||||
// isn't in the vocab and falls back to <oov>
|
||||
auto kmer_token = [&](const std::string & kmer) {
|
||||
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
|
||||
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + k <= seq.size(); i += k) {
|
||||
const std::string kmer = seq.substr(i, k);
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
output.push_back(kmer_token(seq.substr(i, k)));
|
||||
}
|
||||
if (i < seq.size()) {
|
||||
std::string kmer = seq.substr(i);
|
||||
kmer.append(k - kmer.size(), 'A');
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
output.push_back(kmer_token(kmer));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
}
|
||||
GGML_ASSERT(id_to_token.size() == token_to_id.size());
|
||||
|
||||
// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
|
||||
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
|
||||
// k-mers are the block right after <oov>, so only scan from there.
|
||||
if (tokenizer_model == "hybriddna") {
|
||||
const auto idx = token_to_id.find("<oov>");
|
||||
if (idx != token_to_id.end()) {
|
||||
auto it = id_to_token.begin() + idx->second + 1;
|
||||
for (; it != id_to_token.end(); ++it) {
|
||||
std::string & text = it->text;
|
||||
if (text.size() > dna_kmer_marker.size()
|
||||
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
|
||||
text.erase(text.size() - dna_kmer_marker.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
init_tokenizer(type);
|
||||
|
||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue