mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-06-01 22:50:53 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # CMakeLists.txt # docs/speculative.md # ggml/src/ggml-cuda/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/hmx-ops.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-hexagon/htp/rope-ops.c # ggml/src/ggml-hexagon/htp/ssm-conv.c # ggml/src/ggml-opencl/ggml-opencl.cpp # scripts/snapdragon/adb/run-bench.sh # scripts/snapdragon/adb/run-cli.sh # scripts/snapdragon/adb/run-completion.sh # scripts/snapdragon/adb/run-mtmd.sh # scripts/snapdragon/windows/run-bench.ps1 # scripts/snapdragon/windows/run-cli.ps1 # scripts/snapdragon/windows/run-completion.ps1 # scripts/snapdragon/windows/run-mtmd.ps1 # src/llama-vocab.cpp # tests/test-backend-ops.cpp # tools/batched-bench/CMakeLists.txt # tools/batched-bench/batched-bench.cpp # tools/cli/CMakeLists.txt # tools/cli/README.md # tools/cli/cli.cpp # tools/completion/CMakeLists.txt # tools/completion/README.md # tools/llama-bench/CMakeLists.txt # tools/llama-bench/llama-bench.cpp # tools/mtmd/CMakeLists.txt # tools/mtmd/tests/test-deepseek-ocr.py # tools/mtmd/tests/tests-requirements.txt # tools/perplexity/CMakeLists.txt # tools/perplexity/perplexity.cpp # tools/quantize/CMakeLists.txt # tools/server/CMakeLists.txt # tools/server/README.md # ty.toml
This commit is contained in:
commit
718dc159b6
83 changed files with 1469 additions and 648 deletions
|
|
@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
||||
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
|
||||
{ "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL },
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
|
|
@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
|||
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
||||
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||
|
|
@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
|
|||
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
|
||||
// tencent/HunyuanOCR
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
|
||||
// tencent/HunyuanOCR & tencent/HunyuanVL
|
||||
ss << "<|hy_begin▁of▁sentence|>";
|
||||
for (size_t i = 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ enum llm_chat_template {
|
|||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_VL,
|
||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||
LLM_CHAT_TEMPLATE_GROK_2,
|
||||
|
|
|
|||
|
|
@ -1147,6 +1147,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|||
|
||||
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
|
||||
|
||||
if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
static bool warned = false;
|
||||
if (!warned) {
|
||||
LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
if (sampling.samplers.count(seq_id) > 0) {
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
sampling.samplers.erase(seq_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool can_offload =
|
||||
sampler &&
|
||||
sampler->iface->backend_init &&
|
||||
|
|
|
|||
|
|
@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
|
|||
}
|
||||
|
||||
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
// base tensors may not be allocated if there are no non-SWA attention layers
|
||||
if (self_k_idxs && self_k_idxs->buffer) {
|
||||
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
if (self_k_rot) {
|
||||
mctx->get_base()->set_input_k_rot(self_k_rot);
|
||||
|
|
@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
|||
|
||||
bool res = true;
|
||||
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
// base tensors may not be allocated if there are no non-SWA attention layers
|
||||
if (self_k_idxs && self_k_idxs->buffer) {
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -755,6 +755,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
struct llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
|
||||
|
||||
virtual ~llm_tokenizer_bpe_session() = default;
|
||||
|
||||
static void append(const llama_token token_id, std::vector<llama_token> & output) {
|
||||
output.push_back(token_id);
|
||||
}
|
||||
|
|
@ -792,7 +794,7 @@ struct llm_tokenizer_bpe_session {
|
|||
// }
|
||||
}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
int final_prev_index = -1;
|
||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
|
||||
|
||||
|
|
@ -1804,6 +1806,95 @@ private:
|
|||
const llm_tokenizer_plamo2 & tokenizer;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
|
||||
static const std::string open_tag = "<dna>";
|
||||
static const std::string close_tag = "</dna>";
|
||||
|
||||
const auto dna_begin_id = vocab.text_to_token(open_tag);
|
||||
const auto dna_end_id = vocab.text_to_token(close_tag);
|
||||
const auto dna_oov_id = vocab.text_to_token("<oov>");
|
||||
|
||||
// Fall back to plain BPE if the DNA pieces aren't in the vocab.
|
||||
if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
|
||||
llm_tokenizer_bpe_session::tokenize(text, output);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t k = 6;
|
||||
size_t pos = 0;
|
||||
|
||||
while (pos < text.size()) {
|
||||
const size_t start = text.find(open_tag, pos);
|
||||
if (start == std::string::npos) {
|
||||
if (pos < text.size()) {
|
||||
llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (start > pos) {
|
||||
llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
|
||||
}
|
||||
output.push_back(dna_begin_id);
|
||||
|
||||
const size_t content_start = start + open_tag.size();
|
||||
const size_t end = text.find(close_tag, content_start);
|
||||
const size_t content_end = (end == std::string::npos) ? text.size() : end;
|
||||
|
||||
emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
|
||||
|
||||
if (end == std::string::npos) {
|
||||
break;
|
||||
}
|
||||
output.push_back(dna_end_id);
|
||||
pos = end + close_tag.size();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
|
||||
std::string seq = raw;
|
||||
for (char & c : seq) {
|
||||
if (c >= 'a' && c <= 'z') {
|
||||
c = char(c - 32);
|
||||
}
|
||||
}
|
||||
auto is_valid_kmer = [](const std::string & s) {
|
||||
for (char c : s) {
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + k <= seq.size(); i += k) {
|
||||
const std::string kmer = seq.substr(i, k);
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
}
|
||||
if (i < seq.size()) {
|
||||
std::string kmer = seq.substr(i);
|
||||
kmer.append(k - kmer.size(), 'A');
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const llama_vocab & vocab;
|
||||
};
|
||||
|
||||
//
|
||||
// impl
|
||||
//
|
||||
|
|
@ -2034,7 +2125,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
special_mask_id = 103;
|
||||
|
||||
add_sep = true;
|
||||
} else if (tokenizer_model == "gpt2") {
|
||||
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
|
||||
type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
|
|
@ -3421,12 +3512,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
}
|
||||
break;
|
||||
}
|
||||
|
||||
llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
|
||||
// it calls some other methods that do not exist in llm_tokenizer,
|
||||
// here just cast it to bpe tokenizer object
|
||||
const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
|
||||
|
||||
std::unique_ptr<llm_tokenizer_bpe_session> session;
|
||||
if (vocab.get_tokenizer_model() == "hybriddna") {
|
||||
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
|
||||
} else {
|
||||
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
|
||||
}
|
||||
|
||||
if (add_special) {
|
||||
session.append_bos(output);
|
||||
session->append_bos(output);
|
||||
}
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
|
|
@ -3439,15 +3537,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||
#endif
|
||||
session.tokenize(text, output);
|
||||
session->tokenize(text, output);
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
session.append(fragment.token, output);
|
||||
session->append(fragment.token, output);
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special) {
|
||||
session.append_eos(output);
|
||||
session.check_double_bos_eos(output);
|
||||
session->append_eos(output);
|
||||
session->check_double_bos_eos(output);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
|
|
|
|||
|
|
@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
|||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
|
@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
|||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
|
|
|
|||
|
|
@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
|
|||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
|
@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
|
|||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue