model : add sarvam_moe architecture support (#20275)

Sumit Chatterjee 2026-05-10 00:31:50 +10:00 committed by GitHub
parent 65d7a8bbf0
commit 1e5ad35d56
4 changed files with 46 additions and 0 deletions

View file

@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
        if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
            # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
            res = "f2llmv2"
        if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
            # ref: https://huggingface.co/sarvamai/sarvam-30b
            res = "sarvam-moe"
        if res is None:
            logger.warning("\n")
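Each chkhsh branch maps a tokenizer fingerprint to a pre-tokenizer name; the resulting res string is written into the GGUF metadata (tokenizer.ggml.pre) and is what llama-vocab.cpp matches below as tokenizer_pre. A minimal sketch of how such a fingerprint can be derived, assuming the Hugging Face transformers AutoTokenizer API; the probe string here is illustrative only, not the fixed probe text the scripts actually use:

    # sketch: fingerprint a tokenizer the way the chkhsh branches above consume it
    # (assumption: sha256 over the stringified token ids of a fixed probe string)
    from hashlib import sha256
    from transformers import AutoTokenizer

    probe = "Hello world!  \n\t test 123"  # illustrative only, not the real probe text
    tokenizer = AutoTokenizer.from_pretrained("sarvamai/sarvam-30b")
    chkhsh = sha256(str(tokenizer.encode(probe)).encode()).hexdigest()
    print(chkhsh)  # compare against the hashes hard-coded above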
@@ -11591,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
        raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
    model_arch = gguf.MODEL_ARCH.BAILINGMOE2

    # Sarvam-MoE shares the BailingMoeV2 architecture; the only differences are:
    # - full rotary embeddings (no partial_rotary_factor)
    # - the expert bias is zero-mean normalized at load time

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        # override the partial-rotary value written by BailingMoeV2 with the full rotary dim
        self.gguf_writer.add_rope_dimension_count(rope_dim)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
        if name.endswith(".expert_bias"):
            # Sarvam normalizes the expert bias to zero mean
            inner = gen

            def gen():
                t = inner()
                return t - t.mean()
        return super().filter_tensors((name, gen))
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROVEMOE
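A note on the expert-bias normalization in SarvamMoEModel above: if the bias is used the way BailingMoeV2-style aux-loss-free routers use it (added to every expert's affinity score only for top-k expert selection), then subtracting the mean shifts all scores by the same constant and cannot change which experts are selected; only the stored values are centered. A small sketch of that invariance, assuming PyTorch and an illustrative 4-expert router:

    # sketch: centering the expert bias does not change which experts win top-k,
    # because the same constant is subtracted from every expert's score
    import torch

    scores   = torch.tensor([0.2, -0.1, 0.7, 0.4])  # illustrative router logits
    bias     = torch.tensor([1.5,  1.0, 2.0, 1.3])  # illustrative expert_bias
    centered = bias - bias.mean()                   # what the converter stores

    top_raw      = torch.topk(scores + bias,     k=2).indices
    top_centered = torch.topk(scores + centered, k=2).indices
    assert torch.equal(top_raw, top_centered)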

View file

@@ -155,6 +155,7 @@ models = [
    {"name": "joyai-llm",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
    {"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View file

@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
                break;
            case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
                // Sarvam uses SPM-style BPE (same shape as Gemma4): spaces are replaced with U+2581
                // by the normalizer, and BPE merges run over the whole text on raw UTF-8.
                regex_exprs = {
                    "[^\\n]+|[\\n]+",
                };
                byte_encode = false;
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "gemma4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
                escape_whitespaces = true;
            } else if (
                    tokenizer_pre == "sarvam-moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
                escape_whitespaces = true;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-code" ||

View file

@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM  = 48,
    LLAMA_VOCAB_PRE_TYPE_JAIS2      = 49,
    LLAMA_VOCAB_PRE_TYPE_GEMMA4     = 50,
    LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
};
struct LLM_KV;