Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	docs/speculative.md
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
#	ggml/src/ggml-hexagon/htp/hmx-ops.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	ggml/src/ggml-hexagon/htp/rope-ops.c
#	ggml/src/ggml-hexagon/htp/ssm-conv.c
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	scripts/snapdragon/adb/run-bench.sh
#	scripts/snapdragon/adb/run-cli.sh
#	scripts/snapdragon/adb/run-completion.sh
#	scripts/snapdragon/adb/run-mtmd.sh
#	scripts/snapdragon/windows/run-bench.ps1
#	scripts/snapdragon/windows/run-cli.ps1
#	scripts/snapdragon/windows/run-completion.ps1
#	scripts/snapdragon/windows/run-mtmd.ps1
#	src/llama-vocab.cpp
#	tests/test-backend-ops.cpp
#	tools/batched-bench/CMakeLists.txt
#	tools/batched-bench/batched-bench.cpp
#	tools/cli/CMakeLists.txt
#	tools/cli/README.md
#	tools/cli/cli.cpp
#	tools/completion/CMakeLists.txt
#	tools/completion/README.md
#	tools/llama-bench/CMakeLists.txt
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/tests/test-deepseek-ocr.py
#	tools/mtmd/tests/tests-requirements.txt
#	tools/perplexity/CMakeLists.txt
#	tools/perplexity/perplexity.cpp
#	tools/quantize/CMakeLists.txt
#	tools/server/CMakeLists.txt
#	tools/server/README.md
#	ty.toml
This commit is contained in:
Concedo 2026-05-21 23:47:21 +08:00
commit 718dc159b6
83 changed files with 1469 additions and 648 deletions

View file

@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_begin▁of▁sentence>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_place▁holder▁no▁3>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
ss << "<hy_User>" << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
// tencent/HunyuanOCR
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
// tencent/HunyuanOCR & tencent/HunyuanVL
ss << "<hy_begin▁of▁sentence>";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);

View file

@ -53,7 +53,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_HUNYUAN_VL,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,

View file

@ -1147,6 +1147,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
static bool warned = false;
if (!warned) {
LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
warned = true;
}
if (sampling.samplers.count(seq_id) > 0) {
sched_need_reserve = true;
}
sampling.samplers.erase(seq_id);
return false;
}
const bool can_offload =
sampler &&
sampler->iface->backend_init &&

View file

@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
// base tensors may not be allocated if there are no non-SWA attention layers
if (self_k_idxs && self_k_idxs->buffer) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
mctx->get_base()->set_input_k_rot(self_k_rot);
@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
// base tensors may not be allocated if there are no non-SWA attention layers
if (self_k_idxs && self_k_idxs->buffer) {
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
}
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
}
return res;
}

View file

@ -755,6 +755,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
struct llm_tokenizer_bpe_session {
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
virtual ~llm_tokenizer_bpe_session() = default;
static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id);
}
@ -792,7 +794,7 @@ struct llm_tokenizer_bpe_session {
// }
}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
@ -1804,6 +1806,95 @@ private:
const llm_tokenizer_plamo2 & tokenizer;
};
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
static const std::string open_tag = "<dna>";
static const std::string close_tag = "</dna>";
const auto dna_begin_id = vocab.text_to_token(open_tag);
const auto dna_end_id = vocab.text_to_token(close_tag);
const auto dna_oov_id = vocab.text_to_token("<oov>");
// Fall back to plain BPE if the DNA pieces aren't in the vocab.
if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
llm_tokenizer_bpe_session::tokenize(text, output);
return;
}
const size_t k = 6;
size_t pos = 0;
while (pos < text.size()) {
const size_t start = text.find(open_tag, pos);
if (start == std::string::npos) {
if (pos < text.size()) {
llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
}
break;
}
if (start > pos) {
llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
}
output.push_back(dna_begin_id);
const size_t content_start = start + open_tag.size();
const size_t end = text.find(close_tag, content_start);
const size_t content_end = (end == std::string::npos) ? text.size() : end;
emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
if (end == std::string::npos) {
break;
}
output.push_back(dna_end_id);
pos = end + close_tag.size();
}
}
private:
void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
std::string seq = raw;
for (char & c : seq) {
if (c >= 'a' && c <= 'z') {
c = char(c - 32);
}
}
auto is_valid_kmer = [](const std::string & s) {
for (char c : s) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
return false;
}
}
return true;
};
size_t i = 0;
for (; i + k <= seq.size(); i += k) {
const std::string kmer = seq.substr(i, k);
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
}
if (i < seq.size()) {
std::string kmer = seq.substr(i);
kmer.append(k - kmer.size(), 'A');
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
}
}
const llama_vocab & vocab;
};
//
// impl
//
@ -2034,7 +2125,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_mask_id = 103;
add_sep = true;
} else if (tokenizer_model == "gpt2") {
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
@ -3421,12 +3512,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
}
break;
}
llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
// it calls some other methods that are not exist in llm_tokenizer,
// here just cast it to bpe tokenizer object
const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
std::unique_ptr<llm_tokenizer_bpe_session> session;
if (vocab.get_tokenizer_model() == "hybriddna") {
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
} else {
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
}
if (add_special) {
session.append_bos(output);
session->append_bos(output);
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@ -3439,15 +3537,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
session->tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
session.append(fragment.token, output);
session->append(fragment.token, output);
}
}
if (add_special) {
session.append_eos(output);
session.check_double_bos_eos(output);
session->append_eos(output);
session->check_double_bos_eos(output);
}
} break;
case LLAMA_VOCAB_TYPE_WPM:

View file

@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
res->add_input(std::move(inp));
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
cb(h_norm, "mtp_hnorm", il);
@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
cb(cur, "h_pre_norm", -1);
res->t_h_pre_norm = cur;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
? layer.nextn.shared_head_norm
: model.output_norm;

View file

@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
res->add_input(std::move(inp));
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
cb(h_norm, "mtp_hnorm", il);
@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
cb(cur, "h_pre_norm", -1);
res->t_h_pre_norm = cur;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
? layer.nextn.shared_head_norm
: model.output_norm;