From afcda09d154a285cd366135f98ffc1d357f7ddbd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 22 May 2026 11:17:31 +0200 Subject: [PATCH] vocab : fix HybridDNA tokenizer (#23466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * vocab : mark hybriddna k-mers to avoid BPE token collisions * improved loop --------- Co-authored-by: Sigbjørn Skjæret --- conversion/base.py | 5 +++++ src/llama-vocab.cpp | 50 +++++++++++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/conversion/base.py b/conversion/base.py index 8e12af6c5..d8f050ed3 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1617,6 +1617,11 @@ class TextModel(ModelBase): assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get + # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each + # k-mer's own id (llama.cpp strips it on detokenization) + for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute] + reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute] added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index acf832d05..a5cf148b2 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1581,6 +1581,11 @@ private: const llm_tokenizer_plamo2 & tokenizer; }; +// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical +// base-vocab BPE tokens (e.g. 
CCCCCC) in token_to_id; erased from id_to_token +// text at load +static const std::string dna_kmer_marker = "\xee\x80\x80"; + struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} @@ -1636,34 +1641,22 @@ private: c = char(c - 32); } } - auto is_valid_kmer = [](const std::string & s) { - for (char c : s) { - if (c != 'A' && c != 'C' && c != 'G' && c != 'T') { - return false; - } - } - return true; + + // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply + // isn't in the vocab and falls back to <oov> + auto kmer_token = [&](const std::string & kmer) { + const auto tok = vocab.text_to_token(kmer + dna_kmer_marker); + return tok != LLAMA_TOKEN_NULL ? tok : oov_id; }; size_t i = 0; for (; i + k <= seq.size(); i += k) { - const std::string kmer = seq.substr(i, k); - if (is_valid_kmer(kmer)) { - const auto tok = vocab.text_to_token(kmer); - output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); - } else { - output.push_back(oov_id); - } + output.push_back(kmer_token(seq.substr(i, k))); } if (i < seq.size()) { std::string kmer = seq.substr(i); kmer.append(k - kmer.size(), 'A'); - if (is_valid_kmer(kmer)) { - const auto tok = vocab.text_to_token(kmer); - output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); - } else { - output.push_back(oov_id); - } + output.push_back(kmer_token(kmer)); } } @@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); + // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase + // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The + // k-mers are the block right after <oov>, so only scan from there. 
+ if (tokenizer_model == "hybriddna") { + const auto idx = token_to_id.find("<oov>"); + if (idx != token_to_id.end()) { + auto it = id_to_token.begin() + idx->second + 1; + for (; it != id_to_token.end(); ++it) { + std::string & text = it->text; + if (text.size() > dna_kmer_marker.size() + && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) { + text.erase(text.size() - dna_kmer_marker.size()); + } + } + } + } + init_tokenizer(type); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'