From 7ea23ddf7b7cb57e37275774f2d4b718db5c5c26 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 21 May 2026 08:34:32 +0200 Subject: [PATCH] vocab : add Carbon-3B (HybridDNATokenizer) support (#23410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * vocab : add Carbon-3B (HybridDNATokenizer) support Adds a new BPE pre-type LLAMA_VOCAB_PRE_TYPE_CARBON for the HybridDNATokenizer used by HuggingFaceBio/Carbon-{500M,3B,8B}. The base BPE is Qwen3-4B-Base's; what differs is that text inside ... regions is chunked into fixed 6-mers (right-padded with 'A' on the trailing partial), and any base outside ACGT maps to . * src/llama-vocab.{h,cpp}: new pre-type, dispatched from llm_tokenizer_bpe_session::tokenize. * src/llama-vocab-carbon.h: pure helpers (tokenize_carbon, emit_dna_kmers) factored out for unit testing — no llama_vocab dependency, vocab access goes through a std::function. * conversion/base.py: detect HybridDNATokenizer by class name in get_vocab_base_pre (chktxt collides with Qwen3 base since it has no ), and pass trust_remote_code=True in get_vocab_base so the custom tokenizer class can load. * tests/test-tokenizer-carbon.cpp: 12 cases covering single 6-mer, multi 6-mer, lowercase, invalid base -> , partial k-mer right-pad, mixed text+DNA, empty , unterminated , two regions, vocab miss. * vocab : align Carbon-3B changes with llama.cpp conventions * Fold tokenize_carbon + emit_dna_kmers inline into llm_tokenizer_bpe_session (drop src/llama-vocab-carbon.h), matching how every other tokenizer keeps its helpers inside llama-vocab.cpp. * Replace the standalone unit test with the conventional test-tokenizer-0 row backed by models/ggml-vocab-carbon.gguf (vocab-only conversion) + .inp/.out fixtures covering single 6-mer, multi 6-mer, lowercase, invalid base -> , partial right-pad, mixed text+DNA, empty , unterminated , two regions. * Register "carbon" in convert_hf_to_gguf_update.py's model list (pointing at HuggingFaceBio/Carbon-3B) and teach both AutoTokenizer call sites in the updater to pass trust_remote_code=True for it, matching how t5 is special-cased. * vocab : move Carbon dispatch to _set_vocab_carbon + LlamaModel branch Refactor the conversion-side changes to follow the per-tokenizer-family convention used by _set_vocab_qwen, _set_vocab_interns1, _set_vocab_glm, etc. instead of conditionalising the shared get_vocab_base / get_vocab_base_pre paths. * conversion/base.py: add _set_vocab_carbon — self-contained, loads with trust_remote_code=True so HybridDNATokenizer's merged Qwen3 + DNA vocab is visible, writes tokenizer.ggml.pre = "carbon" directly. * conversion/llama.py: branch in LlamaModel.set_vocab on tokenizer_config.json["tokenizer_class"] == "HybridDNATokenizer" and dispatch to _set_vocab_carbon. Same precedent as conversion/bert.py (tokenizer_class branch between BertTokenizer / RobertaTokenizer) and conversion/phi.py. * conversion/base.py: revert the conditional in get_vocab_base and the class-name short-circuit in the auto-generated get_vocab_base_pre. * tests : expand ggml-vocab-carbon.gguf fixtures with model-card examples Add 6 cases from the Carbon-3B model card on top of the existing edge coverage: the unterminated basic-completion prompt, the closed 33-bp example, the metadata-conditioned prompt (with and which BPE-decompose since they are not in the vocab), the documented anti-pattern of raw DNA without tags, and the two likelihood-scoring examples. Brings the suite to 19 cases. * vocab : promote HybridDNATokenizer to its own LLAMA_VOCAB_TYPE Refactor per upstream review: > This should be its own tokenizer model, ie. carbonhybriddna instead > of gpt2 and not carbon pre-tokenizer. That way you can keep the > correct pre-tokenizer, in case that ever changes. Previously the tokenizer was modelled as LLAMA_VOCAB_TYPE_BPE plus a new LLAMA_VOCAB_PRE_TYPE_CARBON, which (a) put a CARBON-specific branch inside llm_tokenizer_bpe_session::tokenize (only existing pre-types differ in regex, not dispatch logic), and (b) conflated "hybrid DNA tokenization" with "Qwen3 BPE pre-tokenizer". This change moves it to its own vocab type, peer to PLAMO2, with the GGUF model name matching the HF tokenizer class (HybridDNATokenizer): * include/llama.h: new LLAMA_VOCAB_TYPE_HYBRIDDNA = 7. * src/llama-vocab.cpp: new llm_tokenizer_hybriddna + session that owns std::unique_ptr for non- text and routes raw text through a DNA-aware splitter; wired into init_tokenizer, tokenize, type_name, byte_to_token, and the BPE-style token_to_piece case (DNA k-mers + // are pure ASCII, so byte-level BPE decoding handles them). LLAMA_VOCAB_TYPE_HYBRIDDNA gets its own branch in the vocab-type config block alongside SPM/WPM/UGM/RWKV, where pre_type is set to QWEN2 and the matching add_space_prefix / escape_whitespaces / clean_spaces flags are applied — mirroring qwen2's BPE path so byte-level BPE merging stays bit-identical to the Python reference for non-DNA text. * src/llama-vocab.h: drop the short-lived LLAMA_VOCAB_PRE_TYPE_CARBON. * conversion/base.py: _set_vocab_hybriddna writes tokenizer.ggml.model = "hybriddna" (no separate pre). * conversion/llama.py: dispatch on tokenizer_class == "HybridDNATokenizer" same as bert.py / phi.py do. * models/ggml-vocab-hybriddna.gguf{,.inp,.out}: renamed fixture + regenerated metadata. * convert_hf_to_gguf_update.py: drop the stale chkhsh entry and trust_remote_code special-case (no longer needed since dispatch is now class-name driven, not chkhsh). Verified end-to-end against HuggingFaceBio/Carbon-{500M,3B,8B}: tokenization is bit-identical to the Python HybridDNATokenizer for all 19 test fixtures plus the model-card metadata-conditioned prompt; greedy completion produces the same DNA continuation as the Python reference; spec-dec with 500M as draft for 8B still works. * vocab : relax llm_tokenizer_bpe assert to allow HYBRIDDNA * vocab : drop llm_tokenizer_bpe vocab-type assert * vocab : write tokenizer.ggml.pre for HYBRIDDNA, share BPE dispatch * vocab : assert BPE or HYBRIDDNA in llm_tokenizer_bpe * vocab : annotate #endif with PRETOKENIZERDEBUG * vocab : drop local hybriddna fixture (moves to ggml-org/vocabs) * deduplicate * simplify * simplify --------- Co-authored-by: Sigbjørn Skjæret --- conversion/base.py | 36 ++++++++++++++ conversion/llama.py | 16 +++--- src/llama-vocab.cpp | 115 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 152 insertions(+), 15 deletions(-) diff --git a/conversion/base.py b/conversion/base.py index 30c2124c2..8e12af6c5 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1610,6 +1610,42 @@ class TextModel(ModelBase): special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_hybriddna(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + tokpre = self.get_vocab_base_pre(tokenizer) + self.gguf_writer.add_tokenizer_model("hybriddna") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_qwen(self): from .qwen import QwenModel diff --git a/conversion/llama.py b/conversion/llama.py index 41fde5143..fd6167bfd 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -51,6 +51,15 @@ class LlamaModel(TextModel): if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): self._set_vocab_mistral() + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None: + self.gguf_writer.add_add_space_prefix(add_prefix_space) + if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer": + return self._set_vocab_hybriddna() + try: self._set_vocab_sentencepiece() except FileNotFoundError: @@ -72,13 +81,6 @@ class LlamaModel(TextModel): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index f43cf546c..acf832d05 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -530,6 +530,8 @@ struct llm_tokenizer_bpe : llm_tokenizer { struct llm_tokenizer_bpe_session { llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + virtual ~llm_tokenizer_bpe_session() = default; + static void append(const llama_token token_id, std::vector & output) { output.push_back(token_id); } @@ -567,7 +569,7 @@ struct llm_tokenizer_bpe_session { } } - void tokenize(const std::string & text, std::vector & output) { + virtual void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode); @@ -1579,6 +1581,95 @@ private: const llm_tokenizer_plamo2 & tokenizer; }; +struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { + llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} + + void tokenize(const std::string & text, std::vector & output) override { + static const std::string open_tag = ""; + static const std::string close_tag = ""; + + const auto dna_begin_id = vocab.text_to_token(open_tag); + const auto dna_end_id = vocab.text_to_token(close_tag); + const auto dna_oov_id = vocab.text_to_token(""); + + // Fall back to plain BPE if the DNA pieces aren't in the vocab. + if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) { + llm_tokenizer_bpe_session::tokenize(text, output); + return; + } + + const size_t k = 6; + size_t pos = 0; + + while (pos < text.size()) { + const size_t start = text.find(open_tag, pos); + if (start == std::string::npos) { + if (pos < text.size()) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos), output); + } + break; + } + if (start > pos) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output); + } + output.push_back(dna_begin_id); + + const size_t content_start = start + open_tag.size(); + const size_t end = text.find(close_tag, content_start); + const size_t content_end = (end == std::string::npos) ? text.size() : end; + + emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output); + + if (end == std::string::npos) { + break; + } + output.push_back(dna_end_id); + pos = end + close_tag.size(); + } + } + +private: + void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector & output) { + std::string seq = raw; + for (char & c : seq) { + if (c >= 'a' && c <= 'z') { + c = char(c - 32); + } + } + auto is_valid_kmer = [](const std::string & s) { + for (char c : s) { + if (c != 'A' && c != 'C' && c != 'G' && c != 'T') { + return false; + } + } + return true; + }; + + size_t i = 0; + for (; i + k <= seq.size(); i += k) { + const std::string kmer = seq.substr(i, k); + if (is_valid_kmer(kmer)) { + const auto tok = vocab.text_to_token(kmer); + output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); + } else { + output.push_back(oov_id); + } + } + if (i < seq.size()) { + std::string kmer = seq.substr(i); + kmer.append(k - kmer.size(), 'A'); + if (is_valid_kmer(kmer)) { + const auto tok = vocab.text_to_token(kmer); + output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); + } else { + output.push_back(oov_id); + } + } + } + + const llama_vocab & vocab; +}; + // // impl // @@ -1808,7 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_mask_id = 103; add_sep = true; - } else if (tokenizer_model == "gpt2") { + } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") { type = LLAMA_VOCAB_TYPE_BPE; // read bpe merges and populate bpe ranks @@ -3144,11 +3235,19 @@ std::vector llama_vocab::impl::tokenize( } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe_session session(vocab, *static_cast(tokenizer.get())); // it calls some other methods that are not exist in llm_tokenizer, // here just cast it to bpe tokenizer object + const llm_tokenizer_bpe * tok_bpe = static_cast(tokenizer.get()); + + std::unique_ptr session; + if (vocab.get_tokenizer_model() == "hybriddna") { + session = std::make_unique(vocab, *tok_bpe); + } else { + session = std::make_unique(vocab, *tok_bpe); + } + if (add_special) { - session.append_bos(output); + session->append_bos(output); } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -3161,15 +3260,15 @@ std::vector llama_vocab::impl::tokenize( #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(text, output); + session->tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - session.append(fragment.token, output); + session->append(fragment.token, output); } } if (add_special) { - session.append_eos(output); - session.check_double_bos_eos(output); + session->append_eos(output); + session->check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: