Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build-cross.yml
#	.github/workflows/build-self-hosted.yml
#	.github/workflows/release.yml
#	examples/llama.android/lib/src/main/cpp/CMakeLists.txt
#	ggml/CMakeLists.txt
#	ggml/src/ggml-rpc/CMakeLists.txt
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/sync_vendor.py
#	tests/test-chat.cpp
#	tests/test-mtmd-c-api.c
#	tools/server/README.md
This commit is contained in:
Concedo 2026-04-20 20:19:11 +08:00
commit cd6788007e
86 changed files with 1384 additions and 1240 deletions

View file

@ -294,7 +294,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
hf_tag = "default";
}
std::string model_endpoint = get_model_endpoint();
std::string model_endpoint = common_get_model_endpoint();
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
// prepare local path for caching
@ -1318,13 +1318,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"--clear-idle"},
{"--no-clear-idle"},
{"--cache-idle-slots"},
{"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
params.clear_idle = value;
params.cache_idle_slots = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},

View file

@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
if (!format.per_call_start.empty()) {
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
}
if (!format.section_start.empty()) {
tool_calls = p.trigger_rule("tool-calls",
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
}
} else {
std::string separator = ", "; // Default

View file

@ -2349,7 +2349,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
? input
: params.generation_prompt + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {

View file

@ -1388,7 +1388,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result::~common_init_result() = default;
std::string get_model_endpoint() {
std::string common_get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
@ -1403,6 +1403,42 @@ std::string get_model_endpoint() {
return model_endpoint;
}
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
auto * mem = llama_get_memory(ctx);
if (mem == nullptr) {
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
}
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx);
return res;
}
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
std::vector<llama_adapter_lora *> loras;
std::vector<float> scales;

View file

@ -12,7 +12,6 @@
#include <sstream>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include <map>
@ -304,15 +303,15 @@ struct common_params_speculative {
// general-purpose speculative decoding parameters
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
// ngram-based speculative decoding
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
@ -568,7 +567,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
bool clear_idle = true; // save and clear idle slots upon starting a new task
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
@ -848,7 +847,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
std::string get_model_endpoint();
// model endpoint from env
std::string common_get_model_endpoint();
//
// Context utils
//
enum common_context_seq_rm_type {
COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
};
// check if the llama_context can remove sequences
// note: clears the memory of the context
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
//
// Batch utils

View file

@ -230,7 +230,7 @@ static nl::json api_get(const std::string & url,
static std::string get_repo_commit(const std::string & repo_id,
const std::string & token) {
try {
auto endpoint = get_model_endpoint();
auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
if (!json.is_object() ||
@ -308,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
hf_files files;
try {
auto endpoint = get_model_endpoint();
auto endpoint = common_get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
if (!json.is_array()) {

View file

@ -208,7 +208,7 @@ void common_ngram_map_begin(
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.idx_last_check = size_begin;
map.size_last_begin = size_begin;
}
@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
}
@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_created = true;
map.last_draft_key_idx = key_offset;
map.last_draft_value_idx = 0; // value 0 is used for simple mode
return;
@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
// update the value statistics
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}

View file

@ -13,6 +13,7 @@
#include <cstring>
#include <iomanip>
#include <map>
#include <cinttypes>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -144,10 +145,28 @@ struct common_speculative_state {
virtual void accept(uint16_t n_accepted) = 0;
};
struct common_speculative_checkpoint {
llama_pos pos_min = 0;
llama_pos pos_max = 0;
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
size_t ckpt_size = 0;
};
struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
llama_context * ctx_dft;
bool use_ckpt = false;
struct common_speculative_checkpoint ckpt;
common_sampler * smpl;
llama_batch batch;
@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
enum common_speculative_type type,
llama_context * ctx_tgt,
llama_context * ctx_dft,
const std::vector<std::pair<std::string, std::string>> & replacements)
const std::vector<std::pair<std::string, std::string>> & replacements,
bool use_ckpt)
: common_speculative_state(type)
, ctx_tgt(ctx_tgt)
, ctx_dft(ctx_dft)
, use_ckpt(use_ckpt)
{
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
smpl = nullptr;
@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
if (use_ckpt && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
}
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
return n;
}
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
return n;
}
void draft(
@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
auto * mem_dft = llama_get_memory(ctx_dft);
int reuse_i = 0;
int reuse_n = 0;
int reuse_i = 0; // index of part to be reused in prompt_dft
int reuse_n = 0; // length of part to be reused in prompt_dft
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
reuse_n = 0;
}
result.clear();
result.reserve(params.n_max);
if (reuse_n == 0) {
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}
bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (use_ckpt) {
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}
if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}
}
const llama_pos n_past = prompt_dft.size();
@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
llama_decode(ctx_dft, batch);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, ret, prompt_cur.size(), prompt_dft.size());
}
common_sampler_reset(smpl);
@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);
ret = llama_decode(ctx_dft, batch);
if (ret != 0) {
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
}
prompt_dft.push_back(id);
}
@ -739,6 +852,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
struct common_speculative {
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
};
@ -798,42 +912,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
}
bool res = true;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx_tgt);
return res;
}
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
@ -908,10 +986,13 @@ common_speculative * common_speculative_init(
case COMMON_SPECULATIVE_TYPE_NONE:
break;
case COMMON_SPECULATIVE_TYPE_DRAFT: {
const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.replacements
/* .replacements = */ params.replacements,
/* .use_ckpt = */ use_ckpt
));
break;
}
@ -966,7 +1047,8 @@ common_speculative * common_speculative_init(
}
auto * result = new common_speculative {
/* .impls = */ std::move(impls)
/* .impls = */ std::move(impls),
/* .curr_impl = */ nullptr,
};
return result;

View file

@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);
@ -39,3 +35,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
struct common_speculative_deleter {
void operator()(common_speculative * s) { common_speculative_free(s); }
};
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;

View file

@ -1850,20 +1850,28 @@ class TextModel(ModelBase):
with open(module_path, encoding="utf-8") as f:
modules = json.load(f)
for mod in modules:
if mod["type"] == "sentence_transformers.models.Pooling":
if mod["type"].endswith("Pooling"):
pooling_path = mod["path"]
break
mode_mapping = {
"mean": gguf.PoolingType.MEAN,
"cls": gguf.PoolingType.CLS,
"lasttoken": gguf.PoolingType.LAST,
}
# get pooling type
if pooling_path is not None:
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
pooling = json.load(f)
if pooling["pooling_mode_mean_tokens"]:
if pooling.get("pooling_mode_mean_tokens"):
pooling_type = gguf.PoolingType.MEAN
elif pooling["pooling_mode_cls_token"]:
elif pooling.get("pooling_mode_cls_token"):
pooling_type = gguf.PoolingType.CLS
elif pooling["pooling_mode_lasttoken"]:
elif pooling.get("pooling_mode_lasttoken"):
pooling_type = gguf.PoolingType.LAST
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
pooling_type = mode_mapping[pooling_mode]
else:
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
self.gguf_writer.add_pooling_type(pooling_type)
@ -7180,7 +7188,7 @@ class EmbeddingGemma(Gemma3Model):
with open(modules_file, encoding="utf-8") as modules_json_file:
mods = json.load(modules_json_file)
for mod in mods:
if mod["type"] == "sentence_transformers.models.Dense":
if mod["type"].endswith("Dense"):
mod_path = mod["path"]
# check if model.safetensors file for Dense layer exists
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
@ -10912,14 +10920,14 @@ class NemotronHModel(GraniteHybridModel):
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size
assert max(tokenizer.vocab.values()) < vocab_size
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
for i in range(vocab_size):
if i not in reverse_vocab:
@ -10930,7 +10938,7 @@ class NemotronHModel(GraniteHybridModel):
if token in added_vocab:
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

View file

@ -1270,7 +1270,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
GGML_ASSERT(ggml_is_contiguous(tensor));
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
GGML_ASSERT(split_state.n_segments == 1);
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[1] == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*tensor->ne[2] == size);
return;
}
switch (split_state.axis) {
case GGML_BACKEND_SPLIT_AXIS_0:
@ -1418,6 +1456,8 @@ struct ggml_backend_meta_context {
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
size_t n_subgraphs = 0;
uint64_t uid = 0;
void * comm_ctx = nullptr;
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
@ -1578,6 +1618,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
const size_t n_backends = ggml_backend_meta_n_backends(backend);
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
bool max_nnodes_raised = false;
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
for (size_t j = 0; j < n_backends; j++) {
@ -1587,173 +1630,181 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
backend_ctx->max_nnodes = cgraph->n_nodes;
max_nnodes_raised = true;
assert(needs_rebuild);
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
if (needs_rebuild) {
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
bcj.nodes[i] = node;
continue;
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
GGML_ASSERT(bcj.nodes[i]);
}
}
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
{
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
auto get_i_delayed = [&](const int i) -> int {
int id = i; // i_delayed
int idr = i; // i_delayed return, last safe return value
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
node = next;
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
}
}
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
}
for (int32_t k = 0; k < n_used; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
for (int32_t k = 0; k < n_used - 2; k++) {
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
return idr;
}
id++;
}
idr = id;
return idr;
};
};
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
int i_start = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
}
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
if (!new_subgraph) {
continue;
}
i = get_i_delayed(i);
i = get_i_delayed(i);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
}
n_subgraphs++;
i_start = i + 1;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
backend_ctx->uid = cgraph->uid;
backend_ctx->n_subgraphs = n_subgraphs;
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.cgraphs[n_subgraphs].offset = i_start;
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
n_subgraphs++;
i_start = i + 1;
backend_ctx->max_tmp_size = max_tmp_size;
}
GGML_ASSERT(i_start == cgraph->n_nodes);
}
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
backend_ctx->ctx.reset(ggml_init(params));
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < n_subgraphs; i++) {
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
}
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
}
}
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
cgraph_ij->n_nodes = i_node_stop - i_node_start;
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
ggml_tensor * node_ij = bcj.nodes[i_node];
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
}
cgraph_ij->uid = ggml_graph_next_uid();
}
}
}
@ -1860,7 +1911,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
};
for (size_t i = 0; i < n_subgraphs; i++) {
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
@ -1869,7 +1920,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
}
if (n_backends > 1 && i < n_subgraphs - 1) {
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
bool backend_allreduce_success = false;
if (backend_ctx->comm_ctx) {
std::vector<ggml_tensor *> nodes;

View file

@ -274,10 +274,6 @@ static const char * cu_get_error_str(CUresult err) {
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#if defined(TURING_MMA_AVAILABLE)
#define LDMATRIX_TRANS_AVAILABLE
#endif // defined(TURING_MMA_AVAILABLE)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
@ -1194,6 +1190,7 @@ struct ggml_cuda_graph {
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
uint64_t uid = 0;
int64_t last_used_time = 0;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
@ -1375,12 +1372,28 @@ struct ggml_backend_cuda_context {
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
int64_t last_graph_eviction_sweep = 0;
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
const int64_t time_now = ggml_time_us();
// sweep every 5s, evicting cuda graphs unused for >=10s
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
last_graph_eviction_sweep = time_now;
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
if (time_now - it->second->last_used_time >= 10'000'000) {
it = cuda_graphs.erase(it);
} else {
++it;
}
}
}
auto it = cuda_graphs.find(first_node_ptr);
if (it == cuda_graphs.end()) {
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
return cuda_graphs[first_node_ptr].get();
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
}
it->second->last_used_time = time_now;
return it->second.get();
}

View file

@ -305,12 +305,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
// The minimum granularity is 16 bytes.
constexpr int h2_per_chunk = 16/sizeof(half2);
const int chunks_per_row = D2 / h2_per_chunk;
if constexpr (use_cp_async) {
static_assert(warp_size == 32, "bad warp_size");
static_assert(!oob_check, "OOB check not compatible with cp_async");
constexpr int preload = 64;
constexpr int h2_per_chunk = 16/sizeof(half2);
const int chunks_per_row = D2 / h2_per_chunk;
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
@ -348,11 +349,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
// 6: max 1*16= 16 bytes, 8 half
ggml_cuda_unroll<6>{}(load);
} else {
// TODO use ggml_cuda_memcpy_1
const half2 zero[4] = {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}};
auto load = [&] __device__ (const int n) {
const int stride_k = warp_size >> n;
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
const int k0_stop = D2 - D2 % (1*stride_k);
const int stride_k = 32 >> n;
const int k0_start = stride_k == 32 ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
const int stride_i = warp_size / stride_k;
if (k0_start == k0_stop) {
@ -371,15 +372,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
ggml_cuda_memcpy_1<16>(tile_KV + i*stride_tile + k*4,
!oob_check || i < i_sup ? KV + i*stride_KV + k*h2_per_chunk : zero);
}
}
};
// 1: max 32* 4=128 bytes, 64 half
// 2: max 16* 4= 64 bytes, 32 half
// 3: max 8* 4= 32 bytes, 16 half
// 4: max 4* 4= 16 bytes, 8 half
ggml_cuda_unroll<4>{}(load);
// 1: max 32*16=512 bytes, 256 half
// 2: max 16*16=256 bytes, 128 half
// 3: max 8*16=128 bytes, 64 half
// 4: max 4*16= 64 bytes, 32 half
// 5: max 2*16= 32 bytes, 16 half
// 6: max 1*16= 16 bytes, 8 half
ggml_cuda_unroll<6>{}(load);
}
}
@ -862,11 +866,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
T_A_VKQ A_identity;
make_identity_mat(A_identity);
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
#pragma unroll
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
@ -897,29 +896,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
#if defined(LDMATRIX_TRANS_AVAILABLE)
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
#elif defined(AMD_MFMA_AVAILABLE)
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
// Load with transposed addressing: 4 strided half loads.
{
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
const half * xs0_h = (const half *) xs0;
const int stride_h = stride_tile_V * 2; // stride in half units
half * A_h = (half *) A.x;
#pragma unroll
for (int l = 0; l < 4; ++l) {
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
}
}
#else
// TODO: Try to transpose tile_V when loading gmem to smem.
// Use mma to transpose T_A_VKQ for RDNA.
T_A_VKQ A_trans;
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
mma(A, A_trans, A_identity);
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
if constexpr (T_B_KQ::I == 8) {
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
} else {

View file

@ -86,17 +86,12 @@ namespace ggml_cuda_mma {
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
static constexpr bool is_i_major(const data_layout dl) {
return dl == DATA_LAYOUT_I_MAJOR ||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
}
static constexpr __device__ data_layout get_input_data_layout() {
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
return DATA_LAYOUT_I_MAJOR_MIRRORED;
#else
return DATA_LAYOUT_I_MAJOR;
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
}
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
@ -113,7 +108,6 @@ namespace ggml_cuda_mma {
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 64 && J == 2) return true;
if (I == 16 && J == 8) return true;
if (I == 32 && J == 4) return true;
if (I == 16 && J == 16) return true;
@ -122,7 +116,7 @@ namespace ggml_cuda_mma {
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
if constexpr (I == 16 && J == 4) {
return threadIdx.x % 16;
} else if constexpr (I == 16 && J == 8) {
return threadIdx.x % 16;
@ -139,8 +133,8 @@ namespace ggml_cuda_mma {
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
return (2 * ((threadIdx.x / 16) % 2) + l);
if constexpr (I == 16 && J == 4) {
return threadIdx.x / 16;
} else if constexpr (I == 16 && J == 8) {
return 2 * (threadIdx.x / 16) + l;
} else if constexpr (I == 32 && J == 4) {
@ -154,7 +148,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#elif defined(VOLTA_MMA_AVAILABLE)
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@ -283,7 +277,7 @@ namespace ggml_cuda_mma {
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
@ -407,7 +401,7 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
};
template <int I_, int J_>
@ -701,57 +695,12 @@ namespace ggml_cuda_mma {
}
#endif // defined(TURING_MMA_AVAILABLE)
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
#if defined(RDNA4)
const int row = t.get_i(0);
const int left_right = t.get_j(0) / 4;
const int up_down = row / 8;
const int idx = row % 8;
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
#else
GGML_UNUSED_VARS(t);
NO_DEVICE_CODE;
#endif // defined(RDNA4)
}
template <int I, int J, typename T, data_layout dl>
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE)
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
} else {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
#elif defined(AMD_WMMA_AVAILABLE)
// All wmma layout has contiguous data when i-major.
if constexpr (is_i_major(dl)) {
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
#pragma unroll
for (int i = 0; i < aligned_copy_count; ++i) {
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
}
} else {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
}
} else {
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
}
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
}
#endif // defined(AMD_MFMA_AVAILABLE)
}
template <typename T>
@ -764,26 +713,37 @@ namespace ggml_cuda_mma {
: "=r"(xi[0]), "=r"(xi[1])
: "l"(xs));
#else
load_generic(t, xs0, stride);
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
template <typename T>
template <typename T, data_layout dl>
static __device__ __forceinline__ void load_ldmatrix(
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
tile<16, 4, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#ifdef TURING_MMA_AVAILABLE
int * xi = (int *) t.x;
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "=r"(xi[0]), "=r"(xi[1])
: "l"(xs));
#elif defined(AMD_WMMA_AVAILABLE)
#ifdef RDNA3
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
static_assert(sizeof(t.x) == 16, "bad ne");
ggml_cuda_memcpy_1<8>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<8>(t.x + 2, xs0 + t.get_i(0)*stride + 2);
#else
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
static_assert(sizeof(t.x) == 8, "bad ne");
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#endif // RDNA3
#elif defined(AMD_MFMA_AVAILABLE)
static_assert(sizeof(t.x) == 4, "bad ne");
ggml_cuda_memcpy_1<4>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
}
@ -796,19 +756,26 @@ namespace ggml_cuda_mma {
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
: "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if 1
// TODO: more generic handling
static_assert(sizeof(T) == 4, "bad type size");
#elif defined(VOLTA_MMA_AVAILABLE)
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
#elif defined(AMD_WMMA_AVAILABLE)
#ifdef RDNA3
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
static_assert(sizeof(t.x) == 32, "bad ne");
ggml_cuda_memcpy_1<16>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<16>(t.x + 4, xs0 + t.get_i(0)*stride + 4);
#else
load_generic(t, xs0, stride);
#endif // 1
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
static_assert(sizeof(t.x) == 16, "bad ne");
ggml_cuda_memcpy_1<16>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#endif // RDNA3
#elif defined(AMD_MFMA_AVAILABLE)
static_assert(sizeof(t.x) == 8, "bad ne");
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
@ -827,23 +794,30 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void load_ldmatrix(
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
#else
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
template <typename T>
static __device__ __forceinline__ void load_ldmatrix_trans(
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
#ifdef TURING_MMA_AVAILABLE
int * xi = (int * ) t.x;
int * xi = (int *) t.x;
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
: "l"(xs));
#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
half * xh = (half *) t.x;
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
}
#else
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
@ -1218,73 +1192,27 @@ namespace ggml_cuda_mma {
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
B.x[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
B.x[1],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], true);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[1], true, b_vec[1], acc[0], true);
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], true);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[1], true, b_vec[1], acc[0], true);
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@ -1297,19 +1225,10 @@ namespace ggml_cuda_mma {
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
int32x16_t * acc = (int32x16_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
B.x[0],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
B.x[1],
acc[0],
0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#else
@ -1329,7 +1248,7 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void mma(
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
@ -1344,12 +1263,12 @@ namespace ggml_cuda_mma {
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
static __device__ __forceinline__ void mma(
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(VOLTA_MMA_AVAILABLE)
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
@ -1364,41 +1283,35 @@ namespace ggml_cuda_mma {
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(VOLTA_MMA_AVAILABLE)
}
template <data_layout dl_d, data_layout dl_ab>
static __device__ __forceinline__ void mma(
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA4) || defined(CDNA3)
const int64_t xA = uint32_t(A.x[0]);
const int64_t xB = uint32_t(B.x[0]);
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(xA, xB, acc[0], 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
#endif // defined(CDNA4) || defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], false);
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], false);
#endif // RDNA4
#else
GGML_UNUSED(D);

View file

@ -105,7 +105,7 @@ struct tile_x_sizes {
};
static int get_mmq_x_max_host(const int cc) {
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
return (turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
#ifdef GGML_CUDA_FORCE_MMQ
128 : 64;
@ -115,9 +115,9 @@ static int get_mmq_x_max_host(const int cc) {
}
static constexpr __device__ int get_mmq_x_max_device() {
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
return 128;
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
#else // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(GGML_USE_HIP)
return 64;
@ -1055,13 +1055,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
float dB;
const int j = j0 + tile_C::get_j(0);
@ -1296,13 +1296,13 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
@ -1436,57 +1436,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@ -1511,13 +1461,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
@ -1743,74 +1693,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2;
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
tile_C Cm;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tile_A A1;
A1.x[0] = 0x01010101;
A1.x[1] = 0x01010101;
mma(Cm, A1, B[0]);
}
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C Cd;
mma(Cd, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
float tmp = Cd.x[l]*dm.x;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tmp -= Cm.x[l]*dm.y;
}
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@ -1835,13 +1718,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
@ -2574,59 +2457,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 8, int, input_layout> tile_A;
typedef tile<16, 8, int, input_layout> tile_B;
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
typedef tile<64, 2, int, input_layout> tile_load;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B[1];
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B[0]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr data_layout input_layout = get_input_data_layout();
typedef tile<16, 4, int, input_layout> tile_A;
typedef tile<16, 4, int, input_layout> tile_B;
@ -2652,13 +2483,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];

View file

@ -33,7 +33,6 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)

View file

@ -1,7 +1,6 @@
diagnostic(off, subgroup_uniformity);
enable f16;
#define Q_TILE 1
#define KV_TILE 32
#define WG_SIZE 32
@ -11,7 +10,7 @@ struct Params {
seq_len_kv: u32,
stride_mask3: u32,
// Number of KV blocks and Q blocks per batch.
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = ceil(seq_len_q / Q_TILE).
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = seq_len_q.
nblk0: u32,
nblk1: u32,
};
@ -40,7 +39,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
return;
}
let q_start = q_blk * Q_TILE;
let q_start = q_blk;
let k_start = kv_blk * KV_TILE;
let mask_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
@ -54,11 +53,8 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
var local_max = -MASK_MAX;
var local_any = 0u;
for (var q_rel = 0u; q_rel < Q_TILE; q_rel += 1u) {
let q_row = q_start + q_rel;
if (q_row >= params.seq_len_q) {
continue;
}
let q_row = q_start;
if (q_row < params.seq_len_q) {
let row_base = mask_batch_base + q_row * params.seq_len_kv;
for (var k_rel = local_id.x; k_rel < KV_TILE; k_rel += WG_SIZE) {
let k_col = k_start + k_rel;

View file

@ -1077,9 +1077,9 @@ llm_graph_qkv llm_graph_context::build_qkv(
// fused QKV path
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
cb(qkv, "wqkv", il);
if (layer.bqkv) {
qkv = ggml_add(ctx0, qkv, layer.bqkv);
cb(qkv, "bqkv", il);
if (layer.wqkv_b) {
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
cb(qkv, "wqkv_b", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@ -1097,8 +1097,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
// separate Q/K/V path
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
cb(Qcur, "Qcur", il);
if (layer.bq) {
Qcur = ggml_add(ctx0, Qcur, layer.bq);
if (layer.wq_b) {
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
cb(Qcur, "Qcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@ -1107,8 +1107,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
cb(Kcur, "Kcur", il);
if (layer.bk) {
Kcur = ggml_add(ctx0, Kcur, layer.bk);
if (layer.wk_b) {
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
cb(Kcur, "Kcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {
@ -1117,8 +1117,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
}
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
cb(Vcur, "Vcur", il);
if (layer.bv) {
Vcur = ggml_add(ctx0, Vcur, layer.bv);
if (layer.wv_b) {
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
cb(Vcur, "Vcur", il);
}
if (hparams.f_clamp_kqv > 0.0f) {

View file

@ -3217,14 +3217,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (layer.wqkv) {
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
} else {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
}
};
@ -3257,7 +3257,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -3360,7 +3360,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
layer.wo =
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
@ -3495,9 +3495,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
}
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
if (n_ff > 0) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -3717,10 +3716,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -3761,8 +3760,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@ -3878,23 +3877,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; // JinaBertLayer
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@ -3942,10 +3934,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -3978,10 +3970,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@ -4048,7 +4040,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -4227,7 +4219,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@ -4286,7 +4278,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
@ -4450,10 +4442,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -4488,7 +4480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -4805,7 +4797,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -5049,7 +5041,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
// feed forward (w/ optional biases)
@ -5311,10 +5303,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -5729,10 +5721,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -5771,10 +5763,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// attention biases - all have shape n_embd (output dimension of projections)
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -6077,7 +6069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
@ -6146,7 +6138,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@ -6967,7 +6959,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -7049,7 +7041,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// attention layers (with optional bias)
create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
@ -7185,7 +7177,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
@ -7350,7 +7342,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
// optional bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
@ -7581,7 +7573,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
// bias tensors
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -8347,114 +8339,114 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
size_t i = 0;
for (auto label : classifier_labels) {
for (const auto & label : classifier_labels) {
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
}
}
}
if (arch == LLM_ARCH_MAMBA ||
arch == LLM_ARCH_MAMBA2 ||
arch == LLM_ARCH_JAMBA ||
arch == LLM_ARCH_FALCON_H1 ||
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_QWEN35 ||
arch == LLM_ARCH_QWEN35MOE ||
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
}
if (arch == LLM_ARCH_MAMBA ||
arch == LLM_ARCH_MAMBA2 ||
arch == LLM_ARCH_JAMBA ||
arch == LLM_ARCH_FALCON_H1 ||
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_QWEN35 ||
arch == LLM_ARCH_QWEN35MOE ||
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
}
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
if (pimpl->n_elements >= 1e12) {
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
} else if (pimpl->n_elements >= 1e9) {
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
} else if (pimpl->n_elements >= 1e6) {
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
} else {
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
}
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
if (pimpl->n_elements >= 1e12) {
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
} else if (pimpl->n_elements >= 1e9) {
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
} else if (pimpl->n_elements >= 1e6) {
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
} else {
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
}
// general kv
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
// general kv
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
if (arch == LLM_ARCH_DEEPSEEK) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
}
if (arch == LLM_ARCH_DEEPSEEK) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
}
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_QWEN2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_QWEN2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_BAILINGMOE) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}
if (arch == LLM_ARCH_BAILINGMOE) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}
if (arch == LLM_ARCH_BAILINGMOE2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
}
if (arch == LLM_ARCH_BAILINGMOE2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
}
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_GROVEMOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
if (arch == LLM_ARCH_GROVEMOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
}
}
vocab.print_info();

View file

@ -246,6 +246,8 @@ struct llama_layer {
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
struct ggml_tensor * wqkv_b = nullptr;
struct ggml_tensor * wo_b = nullptr;
struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr;
@ -256,13 +258,6 @@ struct llama_layer {
struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention bias
struct ggml_tensor * bq = nullptr;
struct ggml_tensor * bk = nullptr;
struct ggml_tensor * bv = nullptr;
struct ggml_tensor * bo = nullptr;
struct ggml_tensor * bqkv = nullptr;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;

View file

@ -115,12 +115,16 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
throw std::runtime_error("failed to create llama_context from model");
}
std::vector<llama_device_memory_data> ret(model->devices.size());
const size_t nd = model->n_devices();
std::vector<llama_device_memory_data> ret(nd + 1);
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
for (const auto & [buft, mb] : memory_breakdown) {
if (ggml_backend_buft_is_host(buft)) {
ret.back().mb.model += mb.model;
ret.back().mb.context += mb.context;
ret.back().mb.compute += mb.compute;
continue;
}
@ -128,7 +132,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
if (!dev) {
continue;
}
for (size_t i = 0; i < ret.size(); i++) {
for (size_t i = 0; i < nd; i++) {
if (model->devices[i].dev == dev) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
@ -137,7 +141,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}
}
}
for (size_t i = 0; i < ret.size(); i++) {
{
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
size_t free;
size_t total;
ggml_backend_dev_memory(cpu_dev, &free, &total);
ret.back().free = free;
ret.back().total = total;
}
for (size_t i = 0; i < nd; i++) {
size_t free;
size_t total;
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
@ -146,11 +162,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
free = ret.back().free;
total = ret.back().total;
}
ret[i].free = free;
ret[i].total = total;
@ -204,15 +217,15 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
if (nd == 0) {
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
return;
}
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
if (nd == 0) {
margins.push_back(margins_s[0]);
} else {
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
}
std::vector<std::string> dev_names;
@ -239,46 +252,59 @@ static void llama_params_fit_impl(
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
if (nd == 0) {
sum_projected_used = dmds_full.back().mb.total();
sum_free = dmds_full.back().total;
sum_projected_free = sum_free - sum_projected_used;
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (sum_projected_free >= margins[0]) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
__func__, sum_projected_free/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
assert(sum_free >= 0 && sum_projected_used >= 0);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
}
}
}
@ -286,11 +312,15 @@ static void llama_params_fit_impl(
{
int64_t global_surplus = sum_projected_free;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
if (nd == 0) {
global_surplus -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
}
if (global_surplus < 0) {
if (nd == 1) {
if (nd <= 1) {
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
@ -301,8 +331,12 @@ static void llama_params_fit_impl(
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
if (nd == 0) {
sum_used_target -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
@ -317,8 +351,12 @@ static void llama_params_fit_impl(
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
if (nd == 0) {
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
} else {
for (size_t id = 0; id < nd; id++) {
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
}
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
@ -330,7 +368,7 @@ static void llama_params_fit_impl(
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd == 1) {
if (nd <= 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
@ -353,6 +391,9 @@ static void llama_params_fit_impl(
}
}
}
if (nd == 0) {
throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@ -500,8 +541,8 @@ static void llama_params_fit_impl(
std::vector<int64_t> ret;
ret.reserve(nd);
for (const llama_device_memory_data & dmd : dmd_nl) {
ret.push_back(dmd.mb.total());
for (size_t id = 0; id < nd; id++) {
ret.push_back(dmd_nl[id].mb.total());
}
return ret;
};

View file

@ -50,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur_pos", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -55,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -48,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}

View file

@ -48,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View file

@ -72,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View file

@ -57,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(cur, "attn_sub_norm", il);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
if (model.layers[il].wo_b) {
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
}
cb(cur, "attn_out", il);
}

View file

@ -33,7 +33,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -47,7 +47,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -58,7 +58,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -54,7 +54,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -59,7 +59,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -49,7 +49,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -49,7 +49,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -43,7 +43,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -46,7 +46,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -37,7 +37,7 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -46,7 +46,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -92,7 +92,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View file

@ -101,7 +101,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View file

@ -50,7 +50,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -50,7 +50,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View file

@ -64,7 +64,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -65,7 +65,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -50,7 +50,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -27,7 +27,7 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -51,7 +51,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -70,7 +70,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View file

@ -84,7 +84,7 @@ llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_gr
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -56,7 +56,7 @@ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -67,7 +67,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -56,7 +56,7 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View file

@ -70,7 +70,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View file

@ -51,7 +51,7 @@ llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -48,7 +48,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
cb(cur, "attn_out", il);

View file

@ -55,7 +55,7 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {

View file

@ -49,7 +49,7 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, co
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -51,7 +51,7 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -60,7 +60,7 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
cb(Qcur, "Qcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -50,7 +50,7 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -50,7 +50,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -53,7 +53,7 @@ llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -56,7 +56,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View file

@ -56,7 +56,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View file

@ -62,7 +62,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -62,7 +62,7 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View file

@ -58,7 +58,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -52,7 +52,7 @@ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -59,7 +59,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
cb(Kcur, "Kcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -55,7 +55,7 @@ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View file

@ -36,7 +36,7 @@ llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_gr
n_embd_head, n_head, n_head_kv, il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -50,7 +50,7 @@ llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View file

@ -41,7 +41,7 @@ llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_par
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
cur = build_attn(inp_attn_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}

View file

@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
return n_pos;
}
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
for (size_t i = 0; i < n_tokens; i++) {
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, pos_0, i);
}
}
@ -163,15 +163,15 @@ struct decode_embd_batch {
}
// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
void set_position_mrope_2d(const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
seq_id_0[0] = seq_id;
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + rel_pos[i].t;
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i ] = rel_pos[i].t;
pos[i + batch.n_tokens ] = rel_pos[i].y;
pos[i + batch.n_tokens * 2] = rel_pos[i].x;
pos[i + batch.n_tokens * 3] = rel_pos[i].z;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@ -188,7 +188,7 @@ struct decode_embd_batch {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens * 3] = pos_0 + i;
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
}
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
mtmd_helper_image_get_decoder_pos(image_tokens, n_past, rel_pos.data());
batch_embd.set_position_mrope_2d(rel_pos, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {

View file

@ -49,7 +49,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
// out_pos must have length == mtmd_helper_get_n_tokens(image)
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
// helper function that automatically:
// 1. run llama_decode() on text chunks

View file

@ -1246,11 +1246,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
mtmd_decoder_pos pos;
pos.t = 0;
pos.x = i % image_tokens->nx;
pos.y = i / image_tokens->nx;
// M-RoPE logic
// TODO: support other types of position encoding if needed
pos.t = pos_0;
pos.x = pos_0 + (i % image_tokens->nx);
pos.y = pos_0 + (i / image_tokens->nx);
pos.z = 0; // unused for now
return pos;
}

View file

@ -196,11 +196,13 @@ struct mtmd_decoder_pos {
uint32_t t;
uint32_t x;
uint32_t y;
uint32_t z; // unused for now, reserved for future use
};
// get position for decoder attention, to be used by M-RoPE models
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
// pos_0 is the absolute position of the first token
// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it

View file

@ -391,15 +391,25 @@ void server_tokens::push_back(server_tokens & tokens) {
}
void server_tokens::insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
const llama_tokens & server_tokens::get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
const llama_tokens & server_tokens::get_tokens() const {
GGML_ASSERT(!has_mtmd);
return tokens;
}
llama_tokens server_tokens::get_text_tokens() const {
llama_tokens res;
res.reserve(tokens.size());
for (llama_token t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
res.push_back(t);
}
}
return res;
}
void server_tokens::set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;

View file

@ -190,7 +190,9 @@ public:
void insert(const llama_tokens & inp_tokens);
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const;
const llama_tokens & get_tokens() const;
llama_tokens get_text_tokens() const;
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id);

View file

@ -1,3 +1,4 @@
#include "server-context.h"
#include "server-common.h"
#include "server-http.h"
@ -19,6 +20,7 @@
#include <exception>
#include <memory>
#include <filesystem>
#include <utility>
// fix problem with std::min and std::max
#if defined(_WIN32)
@ -33,6 +35,31 @@ using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
static server_prompt_checkpoint server_get_checkpoint(llama_context * ctx, int id, int64_t n_tokens, llama_pos pos_min = -1, llama_pos pos_max = -1) {
if (pos_min == -1) {
pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id);
}
if (pos_max == -1) {
pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id);
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto cur = server_prompt_checkpoint {
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ n_tokens,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
};
const size_t n = llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
return cur;
}
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
enum slot_state {
SLOT_STATE_IDLE,
@ -51,13 +78,18 @@ enum server_state {
struct server_slot {
int id;
// TODO: change to unique_ptrs for consistency:
llama_context * ctx = nullptr;
common_context_seq_rm_type ctx_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
// multimodal
mtmd_context * mctx = nullptr;
common_speculative * spec = nullptr;
// speculative decoding
llama_tokens spec_draft;
std::vector<int32_t> spec_i_batch;
server_prompt_checkpoint spec_ckpt;
common_speculative_ptr spec;
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
@ -83,11 +115,6 @@ struct server_slot {
std::string debug_generated_text;
llama_tokens generated_tokens;
// idx of draft tokens in the main batch
// non-empty if we went to evaluate draft tokens
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
std::vector<int32_t> i_batch_dft;
std::vector<completion_token_output> generated_token_probs;
bool has_next_token = true;
@ -147,8 +174,7 @@ struct server_slot {
common_sampler_ptr smpl;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
llama_token sampled; // in speculative mode, this is the last accepted token
// stats
size_t n_sent_text = 0; // number of sent text character
@ -178,8 +204,11 @@ struct server_slot {
stopping_word = "";
n_sent_text = 0;
drafted.clear();
i_batch_dft.clear();
if (can_speculate()) {
spec_draft.clear();
spec_i_batch.clear();
spec_ckpt.clear();
}
generated_tokens.clear();
generated_token_probs.clear();
json_schema = json();
@ -300,6 +329,83 @@ struct server_slot {
return n_draft_max;
}
void update_batch(llama_batch & batch) {
const int n_draft_max = get_n_draft_max();
if (n_draft_max > 0) {
GGML_ASSERT(can_speculate());
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const llama_tokens & tokens = prompt.tokens.get_text_tokens();
const auto & params_spec = task->params.speculative;
if (!spec_draft.empty()) {
// we have a previous (partial) draft to reuse
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
GGML_ASSERT(!spec_ckpt.empty());
}
} else {
GGML_ASSERT(spec_i_batch.empty());
// generate a new draft
spec_draft = common_speculative_draft(spec.get(), params_spec, tokens, sampled);
if (spec_draft.size() > (size_t) n_draft_max) {
SLT_WRN(*this, "draft size %d exceeds max %d, truncating\n", (int) spec_draft.size(), n_draft_max);
spec_draft.resize(n_draft_max);
}
if (spec_draft.size() < (size_t) params_spec.n_min) {
SLT_DBG(*this, "ignoring small draft: %d < %d\n", (int) spec_draft.size(), params_spec.n_min);
spec_draft.clear();
}
if (!spec_draft.empty() && ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
const auto n_tokens = prompt.tokens.size();
spec_ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n",
spec_ckpt.pos_min, spec_ckpt.pos_max, n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
}
}
GGML_ASSERT(spec_draft.size() <= (size_t) n_draft_max);
}
if (spec_draft.empty()) {
// no speculative decoding
i_batch = batch.n_tokens;
common_batch_add(batch, sampled, prompt.tokens.pos_next(), { this->id }, true);
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
sampled, n_ctx, prompt.n_tokens(), truncated);
} else {
SLT_DBG(*this, "generate_draft: id=%d, #tokens=%zu, #draft=%zu, pos_next=%d\n",
sampled, prompt.tokens.size(), spec_draft.size(), prompt.tokens.pos_next());
GGML_ASSERT(spec_i_batch.empty());
spec_i_batch.push_back(batch.n_tokens);
for (size_t i = 0; i < spec_draft.size(); i++) {
spec_i_batch.push_back(batch.n_tokens + i + 1);
}
auto pos0 = prompt.tokens.pos_next();
common_batch_add(batch, sampled, pos0++, { this->id }, true);
for (auto token : spec_draft) {
common_batch_add(batch, token, pos0++, { this->id }, true);
}
}
prompt.tokens.push_back(sampled);
prompt.tokens.insert(spec_draft);
}
void release() {
if (is_processing()) {
GGML_ASSERT(task);
@ -400,7 +506,7 @@ struct server_slot {
);
}
common_speculative_print_stats(spec);
common_speculative_print_stats(spec.get());
}
json to_json(bool only_metrics = false) const {
@ -591,16 +697,17 @@ private:
void destroy() {
llama_init.reset();
ctx = nullptr;
model = nullptr;
mtmd_free(mctx);
mctx = nullptr;
// Clear any sampling context
for (server_slot & slot : slots) {
common_speculative_free(slot.spec);
slot.spec = nullptr;
if (slot.can_speculate()) {
slot.spec.reset();
}
}
llama_batch_free(batch);
@ -642,9 +749,6 @@ private:
llama_init = common_init_from_params(params_base);
// propagate model-metadata sampling defaults back to caller
params.sampling = params_base.sampling;
model = llama_init->model();
ctx = llama_init->context();
@ -660,6 +764,7 @@ private:
add_bos_token = llama_vocab_get_add_bos(vocab);
if (params_base.speculative.has_dft()) {
// TODO speculative: move to common/speculative.cpp?
SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());
const auto & params_spec = params_base.speculative;
@ -727,11 +832,6 @@ private:
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
}
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
}
}
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@ -769,33 +869,38 @@ private:
slots.clear();
const bool can_spec = common_speculative_is_compat(ctx);
if (!can_spec) {
const auto ctx_seq_rm_type = common_context_can_seq_rm(ctx);
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
SRV_WRN("%s", "speculative decoding not supported by this context\n");
}
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
}
// initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
slots.emplace_back();
}
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot & slot = slots[i];
slot.id = i;
slot.ctx = ctx;
slot.n_ctx = n_ctx_slot;
slot.ctx_seq_rm_type = ctx_seq_rm_type;
slot.mctx = mctx;
slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding
if (can_spec) {
slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
if (ctx_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx));
if (slot.spec) {
if (mctx) {
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false;
}
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
} else {
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
}
}
@ -806,8 +911,6 @@ private:
};
slot.reset();
slots.push_back(std::move(slot));
}
{
@ -854,6 +957,9 @@ private:
model_aliases = params_base.model_alias;
model_tags = params_base.model_tags;
// propagate new defaults back to caller
params = params_base;
if (!is_resume) {
return init();
}
@ -880,13 +986,13 @@ private:
metrics.init();
if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@ -1197,7 +1303,7 @@ private:
backend_sampling &= task.params.sampling.backend_sampling;
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0);
backend_sampling &= !(slot.can_speculate() && task.params.speculative.n_max > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
@ -1703,6 +1809,26 @@ private:
return true;
}
// n_tokens_cur: the number of tokens added to the batch for the current slot
void create_checkpoint(server_slot & slot, const int64_t n_tokens_cur, llama_pos pos_min, llama_pos pos_max) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const auto & cur = slot.prompt.checkpoints.emplace_back(server_get_checkpoint(ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max));
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
}
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@ -1759,7 +1885,7 @@ private:
break; // drop the task
}
if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);
@ -1854,7 +1980,7 @@ private:
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;
const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
const llama_tokens & tokens = slot->prompt.tokens.get_tokens();
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
@ -2061,7 +2187,7 @@ private:
{
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy
llama_tokens new_tokens = slot.prompt.tokens.get_tokens(); // copy
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
new_tokens[i - n_discard] = new_tokens[i];
}
@ -2100,61 +2226,7 @@ private:
continue;
}
// generate draft tokens in speculative decoding mode
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
// perform the speculative drafting for all sequences at the same time in a single batch
const int n_draft_max = slot.get_n_draft_max();
if (n_draft_max > 0) {
if (mctx) {
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
GGML_ABORT("not supported by multimodal");
}
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
const auto & params_spec = slot.task->params.speculative;
llama_tokens draft = common_speculative_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
if (draft.size() > (size_t) n_draft_max) {
SLT_WRN(slot, "draft size %d exceeds max %d, truncating\n", (int) draft.size(), n_draft_max);
draft.resize(n_draft_max);
}
// add the sampled token to the batch
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
if (slot.task->params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
// fallback to normal decoding
slot.i_batch = slot.i_batch_dft[0];
slot.drafted.clear();
slot.i_batch_dft.clear();
} else {
// keep track of total number of drafted tokens tested
slot.n_draft_total += draft.size();
// add all drafted tokens to the batch
for (size_t i = 0; i < draft.size(); i++) {
slot.i_batch_dft.push_back(batch.n_tokens);
common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(draft[i]);
}
slot.drafted = std::move(draft);
}
} else {
// no speculative decoding
slot.i_batch = batch.n_tokens;
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
slot.prompt.tokens.push_back(slot.sampled);
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
}
slot.update_batch(batch);
}
// process in chunks of params.n_batch
@ -2515,15 +2587,11 @@ private:
// make a checkpoint of the parts of the memory that cannot be rolled back.
// checkpoints are created only if:
// - the model does not support partial sequence removal
// - the model uses SWA and we are not using `swa_full`
// - the model architecture is marked as recurrent or hybrid
//
// TODO: try to make this conditional on the context or the memory module, instead of the model type
do_checkpoint = do_checkpoint && (
llama_model_is_recurrent(model) ||
llama_model_is_hybrid(model) ||
(llama_model_n_swa(model) > 0 && !params_base.swa_full)
);
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
bool has_mtmd = false;
@ -2651,40 +2719,12 @@ private:
// no need to create checkpoints that are too close together
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
// note: we create the checkpoint before calling llama_decode(), so the current batch is not
// yet processed and therefore it is not part of the checkpoint.
if (do_checkpoint) {
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
// make room for the new checkpoint, if needed
const auto & cur = slot.prompt.checkpoints.front();
SLT_WRN(slot,
"erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
}
const size_t checkpoint_size =
llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
/*.pos_min = */ pos_min,
/*.pos_max = */ pos_max,
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
/*.data = */ std::vector<uint8_t>(checkpoint_size),
});
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id,
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
create_checkpoint(slot, n_tokens_cur, pos_min, pos_max);
}
}
@ -2856,19 +2896,19 @@ private:
slot.state = SLOT_STATE_GENERATING;
if (slot.can_speculate()) {
common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens());
common_speculative_begin(slot.spec.get(), slot.prompt.tokens.get_text_tokens());
}
} else if (slot.state != SLOT_STATE_GENERATING) {
continue; // continue loop of slots
}
if (slot.i_batch_dft.size() > 0) {
if (slot.can_speculate() && !slot.spec_draft.empty()) {
continue; // sample using speculative decoding
}
const int tok_idx = slot.i_batch - i;
llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx);
llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx, tok_idx);
slot.i_batch = -1;
@ -2889,7 +2929,7 @@ private:
completion_token_output result;
result.tok = id;
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
if (slot.task->params.sampling.n_probs > 0) {
@ -2909,43 +2949,85 @@ private:
// speculative decoding - main model sample and accept
for (auto & slot : slots) {
if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) {
continue;
}
const size_t n_draft = slot.drafted.size();
// save the original draft size
const size_t n_draft = slot.spec_draft.size();
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
slot.i_batch_dft.clear();
slot.drafted.clear();
GGML_ASSERT(n_draft > 0);
// verify and try to accept the draft
{
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
slot.spec_i_batch.clear();
SLT_DBG(slot, "%s: n_draft=%zu, accepted=%zu\n", __func__, slot.spec_draft.size(), accepted.size());
GGML_ASSERT(accepted.size() >= 1);
// check for partial draft acceptance
if (accepted.size() < slot.spec_draft.size() + 1) {
if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
// partial acceptance is not supported by the context -> truncate the draft and restore the state
slot.spec_draft = std::move(accepted);
const auto & ckpt = slot.spec_ckpt;
SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n",
ckpt.pos_min, ckpt.pos_max, ckpt.size());
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);
}
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, ckpt.pos_max + 1, -1);
slot.prompt.tokens.keep_first(ckpt.n_tokens);
slot.smpl = std::move(smpl_save);
continue;
}
LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, accepted.size(), slot.spec_draft.size());
}
common_speculative_accept(slot.spec.get(), accepted.size() - 1);
slot.spec_draft = std::move(accepted);
}
const int64_t t_current = ggml_time_us();
slot.n_decoded += ids.size();
const auto ids = std::move(slot.spec_draft);
slot.n_decoded += ids.size();
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
// inform the speculative decoding about the number of accepted tokens
common_speculative_accept(slot.spec, ids.size() - 1);
// rollback to the state before sampling the draft tokens
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.n_draft_total += n_draft;
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
slot.prompt.tokens.insert({ids.begin(), ids.end() - 1});
slot.sampled = ids.back(); // last accepted token
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1);
slot.sampled = ids.back(); // last accepted token
SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft);
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, slot.prompt.n_tokens(), -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
result.prob = 1.0f; // set later
// TODO: set result.probs
@ -3537,6 +3619,7 @@ void server_routes::init_routes() {
{"vision", meta->has_inp_image},
{"audio", meta->has_inp_audio},
} },
{ "media_marker", get_media_marker() },
{ "endpoint_slots", params.endpoint_slots },
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
@ -3664,7 +3747,7 @@ void server_routes::init_routes() {
params.n_predict,
meta->slot_n_ctx,
params.spm_infill,
tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
tokenized_prompts[0].get_tokens() // TODO: this could maybe be multimodal.
);
std::vector<raw_buffer> files; // dummy

View file

@ -162,7 +162,7 @@ common_chat_msg task_result_state::update_chat_msg(
bool filter_tool_calls) {
generated_text += text_added;
auto msg_prv_copy = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
//SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
is_partial,
@ -304,6 +304,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
params.speculative = defaults.speculative;
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

View file

@ -576,6 +576,17 @@ struct server_prompt_checkpoint {
size_t size() const {
return data.size();
}
bool empty() const {
return data.empty();
}
void clear() {
pos_min = 0;
pos_max = 0;
n_tokens = 0;
data.clear();
}
};
struct server_prompt {

View file

@ -91,7 +91,7 @@ def test_clear_and_restore():
def test_disabled_with_flag():
global server
server.no_clear_idle = True
server.no_cache_idle_slots = True
server.start()
log = LogReader(server.log_path)

View file

@ -103,7 +103,7 @@ class ServerProcess:
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
no_clear_idle: bool = False
no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
@ -242,8 +242,8 @@ class ServerProcess:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
if self.no_clear_idle:
server_args.append("--no-clear-idle")
if self.no_cache_idle_slots:
server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")

View file

@ -1,7 +1,5 @@
#include "httplib.h"
namespace httplib {
// httplib::any — type-erased value container (C++11 compatible)
// On C++17+ builds, thin wrappers around std::any are provided.
/*
* Implementation that will be part of the .cc file if split into .h + .cc.
@ -1877,7 +1875,7 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
}
return ret;
#elif TARGET_OS_MAC
#elif TARGET_OS_MAC && defined(__clang__)
if (!node) { return EAI_NONAME; }
// macOS implementation using CFHost API for asynchronous DNS resolution
CFStringRef hostname_ref = CFStringCreateWithCString(
@ -5836,6 +5834,17 @@ std::string Request::get_param_value(const std::string &key,
return std::string();
}
std::vector<std::string>
Request::get_param_values(const std::string &key) const {
auto rng = params.equal_range(key);
std::vector<std::string> values;
values.reserve(static_cast<size_t>(std::distance(rng.first, rng.second)));
for (auto it = rng.first; it != rng.second; ++it) {
values.push_back(it->second);
}
return values;
}
size_t Request::get_param_value_count(const std::string &key) const {
auto r = params.equal_range(key);
return static_cast<size_t>(std::distance(r.first, r.second));
@ -7013,6 +7022,15 @@ Server &Server::set_keep_alive_timeout(time_t sec) {
return *this;
}
template <class Rep, class Period>
Server &Server::set_keep_alive_timeout(
const std::chrono::duration<Rep, Period> &duration) {
detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t /*usec*/) {
set_keep_alive_timeout(sec);
});
return *this;
}
Server &Server::set_read_timeout(time_t sec, time_t usec) {
read_timeout_sec_ = sec;
read_timeout_usec_ = usec;
@ -9119,20 +9137,21 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
auto location = res.get_header_value("location");
if (location.empty()) { return false; }
thread_local const std::regex re(
R"((?:(https?):)?(?://(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)");
detail::UrlComponents uc;
if (!detail::parse_url(location, uc)) { return false; }
std::smatch m;
if (!std::regex_match(location, m, re)) { return false; }
// Only follow http/https redirects
if (!uc.scheme.empty() && uc.scheme != "http" && uc.scheme != "https") {
return false;
}
auto scheme = is_ssl() ? "https" : "http";
auto next_scheme = m[1].str();
auto next_host = m[2].str();
if (next_host.empty()) { next_host = m[3].str(); }
auto port_str = m[4].str();
auto next_path = m[5].str();
auto next_query = m[6].str();
auto next_scheme = std::move(uc.scheme);
auto next_host = std::move(uc.host);
auto port_str = std::move(uc.port);
auto next_path = std::move(uc.path);
auto next_query = std::move(uc.query);
auto next_port = port_;
if (!port_str.empty()) {
@ -9145,7 +9164,7 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
if (next_host.empty()) { next_host = host_; }
if (next_path.empty()) { next_path = "/"; }
auto path = decode_query_component(next_path, true) + next_query;
auto path = decode_path_component(next_path) + next_query;
// Same host redirect - use current client
if (next_scheme == scheme && next_host == host_ && next_port == port_) {
@ -10869,12 +10888,9 @@ Client::Client(const std::string &scheme_host_port)
Client::Client(const std::string &scheme_host_port,
const std::string &client_cert_path,
const std::string &client_key_path) {
const static std::regex re(
R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)");
std::smatch m;
if (std::regex_match(scheme_host_port, m, re)) {
auto scheme = m[1].str();
detail::UrlComponents uc;
if (detail::parse_url(scheme_host_port, uc) && !uc.host.empty()) {
auto &scheme = uc.scheme;
#ifdef CPPHTTPLIB_SSL_ENABLED
if (!scheme.empty() && (scheme != "http" && scheme != "https")) {
@ -10890,12 +10906,10 @@ Client::Client(const std::string &scheme_host_port,
auto is_ssl = scheme == "https";
auto host = m[2].str();
if (host.empty()) { host = m[3].str(); }
auto host = std::move(uc.host);
auto port_str = m[4].str();
auto port = is_ssl ? 443 : 80;
if (!port_str.empty() && !detail::parse_port(port_str, port)) { return; }
if (!uc.port.empty() && !detail::parse_port(uc.port, port)) { return; }
if (is_ssl) {
#ifdef CPPHTTPLIB_SSL_ENABLED
@ -12466,6 +12480,18 @@ std::string Request::sni() const {
*/
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
// These wrappers forward to deprecated APIs that will be removed by v1.0.0.
// Suppress C4996 / -Wdeprecated-declarations so that MSVC /sdl builds (which
// promote C4996 to an error) compile cleanly even though the wrappers
// themselves are also marked [[deprecated]].
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4996)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
SSL_CTX *Client::ssl_context() const {
if (is_ssl_) { return static_cast<SSLClient &>(*cli_).ssl_context(); }
return nullptr;
@ -12480,6 +12506,12 @@ long Client::get_verify_result() const {
if (is_ssl_) { return static_cast<SSLClient &>(*cli_).get_verify_result(); }
return -1; // NOTE: -1 doesn't match any of X509_V_ERR_???
}
#if defined(_MSC_VER)
#pragma warning(pop)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
#endif // CPPHTTPLIB_OPENSSL_SUPPORT
/*
@ -16302,12 +16334,10 @@ bool WebSocket::is_open() const { return !closed_; }
WebSocketClient::WebSocketClient(
const std::string &scheme_host_port_path, const Headers &headers)
: headers_(headers) {
const static std::regex re(
R"(([a-z]+):\/\/(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?(\/.*))");
std::smatch m;
if (std::regex_match(scheme_host_port_path, m, re)) {
auto scheme = m[1].str();
detail::UrlComponents uc;
if (detail::parse_url(scheme_host_port_path, uc) && !uc.scheme.empty() &&
!uc.host.empty() && !uc.path.empty()) {
auto &scheme = uc.scheme;
#ifdef CPPHTTPLIB_SSL_ENABLED
if (scheme != "ws" && scheme != "wss") {
@ -16323,14 +16353,12 @@ WebSocketClient::WebSocketClient(
auto is_ssl = scheme == "wss";
host_ = m[2].str();
if (host_.empty()) { host_ = m[3].str(); }
host_ = std::move(uc.host);
auto port_str = m[4].str();
port_ = is_ssl ? 443 : 80;
if (!port_str.empty() && !detail::parse_port(port_str, port_)) { return; }
if (!uc.port.empty() && !detail::parse_port(uc.port, port_)) { return; }
path_ = m[5].str();
path_ = std::move(uc.path);
#ifdef CPPHTTPLIB_SSL_ENABLED
is_ssl_ = is_ssl;

View file

@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.40.0"
#define CPPHTTPLIB_VERSION_NUM "0x002800"
#define CPPHTTPLIB_VERSION "0.42.0"
#define CPPHTTPLIB_VERSION_NUM "0x002a00"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@ -333,13 +333,10 @@ using socket_t = int;
#include <unordered_map>
#include <unordered_set>
#include <utility>
#if __cplusplus >= 201703L
#include <any>
#endif
// On macOS with a TLS backend, enable Keychain root certificates by default
// unless the user explicitly opts out.
#if defined(__APPLE__) && \
#if defined(__APPLE__) && defined(__clang__) && \
!defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) && \
(defined(CPPHTTPLIB_OPENSSL_SUPPORT) || \
defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || \
@ -358,7 +355,7 @@ using socket_t = int;
#if defined(CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO) || \
defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
#if TARGET_OS_MAC
#if TARGET_OS_MAC && defined(__clang__)
#include <CFNetwork/CFHost.h>
#include <CoreFoundation/CoreFoundation.h>
#endif
@ -701,9 +698,96 @@ inline bool parse_port(const std::string &s, int &port) {
return parse_port(s.data(), s.size(), port);
}
struct UrlComponents {
std::string scheme;
std::string host;
std::string port;
std::string path;
std::string query;
};
inline bool parse_url(const std::string &url, UrlComponents &uc) {
uc = {};
size_t pos = 0;
auto sep = url.find("://");
if (sep != std::string::npos) {
uc.scheme = url.substr(0, sep);
// Scheme must be [a-z]+ only
if (uc.scheme.empty()) { return false; }
for (auto c : uc.scheme) {
if (c < 'a' || c > 'z') { return false; }
}
pos = sep + 3;
} else if (url.compare(0, 2, "//") == 0) {
pos = 2;
}
auto has_authority_prefix = pos > 0;
auto has_authority = has_authority_prefix || (!url.empty() && url[0] != '/' &&
url[0] != '?' && url[0] != '#');
if (has_authority) {
if (pos < url.size() && url[pos] == '[') {
auto close = url.find(']', pos);
if (close == std::string::npos) { return false; }
uc.host = url.substr(pos + 1, close - pos - 1);
// IPv6 host must be [a-fA-F0-9:]+ only
if (uc.host.empty()) { return false; }
for (auto c : uc.host) {
if (!((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ||
(c >= '0' && c <= '9') || c == ':')) {
return false;
}
}
pos = close + 1;
} else {
auto end = url.find_first_of(":/?#", pos);
if (end == std::string::npos) { end = url.size(); }
uc.host = url.substr(pos, end - pos);
pos = end;
}
if (pos < url.size() && url[pos] == ':') {
++pos;
auto end = url.find_first_of("/?#", pos);
if (end == std::string::npos) { end = url.size(); }
uc.port = url.substr(pos, end - pos);
pos = end;
}
// Without :// or //, the entire input must be consumed as host[:port].
// If there is leftover (path, query, etc.), this is not a valid
// host[:port] string — clear and reparse as a plain path.
if (!has_authority_prefix && pos < url.size()) {
uc.host.clear();
uc.port.clear();
pos = 0;
}
}
if (pos < url.size() && url[pos] != '?' && url[pos] != '#') {
auto end = url.find_first_of("?#", pos);
if (end == std::string::npos) { end = url.size(); }
uc.path = url.substr(pos, end - pos);
pos = end;
}
if (pos < url.size() && url[pos] == '?') {
auto end = url.find('#', pos);
if (end == std::string::npos) { end = url.size(); }
uc.query = url.substr(pos, end - pos);
}
return true;
}
} // namespace detail
enum SSLVerifierResponse {
enum class SSLVerifierResponse {
// no decision has been made, use the built-in certificate verifier
NoDecisionMade,
// connection certificate is verified and accepted
@ -797,38 +881,15 @@ using Match = std::smatch;
using DownloadProgress = std::function<bool(size_t current, size_t total)>;
using UploadProgress = std::function<bool(size_t current, size_t total)>;
#if __cplusplus >= 201703L
using any = std::any;
using bad_any_cast = std::bad_any_cast;
template <typename T> T any_cast(const any &a) { return std::any_cast<T>(a); }
template <typename T> T any_cast(any &a) { return std::any_cast<T>(a); }
template <typename T> T any_cast(any &&a) {
return std::any_cast<T>(std::move(a));
}
template <typename T> const T *any_cast(const any *a) noexcept {
return std::any_cast<T>(a);
}
template <typename T> T *any_cast(any *a) noexcept {
return std::any_cast<T>(a);
}
#else // C++11/14 implementation
class bad_any_cast : public std::bad_cast {
public:
const char *what() const noexcept override { return "bad any_cast"; }
};
/*
* detail: type-erased storage used by UserData.
* ABI-stable regardless of C++ standard always uses this custom
* implementation instead of std::any.
*/
namespace detail {
using any_type_id = const void *;
// Returns a unique per-type ID without RTTI.
// The static address is stable across TUs because function templates are
// implicitly inline and the ODR merges their statics into one.
template <typename T> any_type_id any_typeid() noexcept {
static const char id = 0;
return &id;
@ -851,89 +912,60 @@ template <typename T> struct any_value final : any_storage {
} // namespace detail
class any {
std::unique_ptr<detail::any_storage> storage_;
class UserData {
public:
any() noexcept = default;
any(const any &o) : storage_(o.storage_ ? o.storage_->clone() : nullptr) {}
any(any &&) noexcept = default;
any &operator=(const any &o) {
storage_ = o.storage_ ? o.storage_->clone() : nullptr;
return *this;
UserData() = default;
UserData(UserData &&) noexcept = default;
UserData &operator=(UserData &&) noexcept = default;
UserData(const UserData &o) {
for (const auto &e : o.entries_) {
if (e.second) { entries_[e.first] = e.second->clone(); }
}
}
any &operator=(any &&) noexcept = default;
template <
typename T, typename D = typename std::decay<T>::type,
typename std::enable_if<!std::is_same<D, any>::value, int>::type = 0>
any(T &&v) : storage_(new detail::any_value<D>(std::forward<T>(v))) {}
template <
typename T, typename D = typename std::decay<T>::type,
typename std::enable_if<!std::is_same<D, any>::value, int>::type = 0>
any &operator=(T &&v) {
storage_.reset(new detail::any_value<D>(std::forward<T>(v)));
UserData &operator=(const UserData &o) {
if (this != &o) {
entries_.clear();
for (const auto &e : o.entries_) {
if (e.second) { entries_[e.first] = e.second->clone(); }
}
}
return *this;
}
bool has_value() const noexcept { return storage_ != nullptr; }
void reset() noexcept { storage_.reset(); }
template <typename T> void set(const std::string &key, T &&value) {
using D = typename std::decay<T>::type;
entries_[key].reset(new detail::any_value<D>(std::forward<T>(value)));
}
template <typename T> friend T *any_cast(any *a) noexcept;
template <typename T> friend const T *any_cast(const any *a) noexcept;
template <typename T> T *get(const std::string &key) noexcept {
auto it = entries_.find(key);
if (it == entries_.end() || !it->second) { return nullptr; }
if (it->second->type_id() != detail::any_typeid<T>()) { return nullptr; }
return &static_cast<detail::any_value<T> *>(it->second.get())->value;
}
template <typename T> const T *get(const std::string &key) const noexcept {
auto it = entries_.find(key);
if (it == entries_.end() || !it->second) { return nullptr; }
if (it->second->type_id() != detail::any_typeid<T>()) { return nullptr; }
return &static_cast<const detail::any_value<T> *>(it->second.get())->value;
}
bool has(const std::string &key) const noexcept {
return entries_.find(key) != entries_.end();
}
void erase(const std::string &key) { entries_.erase(key); }
void clear() noexcept { entries_.clear(); }
private:
std::unordered_map<std::string, std::unique_ptr<detail::any_storage>>
entries_;
};
template <typename T> T *any_cast(any *a) noexcept {
if (!a || !a->storage_) { return nullptr; }
if (a->storage_->type_id() != detail::any_typeid<T>()) { return nullptr; }
return &static_cast<detail::any_value<T> *>(a->storage_.get())->value;
}
template <typename T> const T *any_cast(const any *a) noexcept {
if (!a || !a->storage_) { return nullptr; }
if (a->storage_->type_id() != detail::any_typeid<T>()) { return nullptr; }
return &static_cast<const detail::any_value<T> *>(a->storage_.get())->value;
}
template <typename T> T any_cast(const any &a) {
using U =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
const U *p = any_cast<U>(&a);
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
if (!p) { throw bad_any_cast{}; }
#else
if (!p) { std::abort(); }
#endif
return static_cast<T>(*p);
}
template <typename T> T any_cast(any &a) {
using U =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
U *p = any_cast<U>(&a);
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
if (!p) { throw bad_any_cast{}; }
#else
if (!p) { std::abort(); }
#endif
return static_cast<T>(*p);
}
template <typename T> T any_cast(any &&a) {
using U =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
U *p = any_cast<U>(&a);
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
if (!p) { throw bad_any_cast{}; }
#else
if (!p) { std::abort(); }
#endif
return static_cast<T>(std::move(*p));
}
#endif // __cplusplus >= 201703L
struct Response;
using ResponseHandler = std::function<bool(const Response &response)>;
@ -1261,6 +1293,7 @@ struct Request {
bool has_param(const std::string &key) const;
std::string get_param_value(const std::string &key, size_t id = 0) const;
std::vector<std::string> get_param_values(const std::string &key) const;
size_t get_param_value_count(const std::string &key) const;
bool is_multipart_form_data() const;
@ -1293,7 +1326,7 @@ struct Response {
// User-defined context — set by pre-routing/pre-request handlers and read
// by route handlers to pass arbitrary data (e.g. decoded auth tokens).
std::map<std::string, any> user_data;
UserData user_data;
bool has_header(const std::string &key) const;
std::string get_header_value(const std::string &key, const char *def = "",
@ -1664,6 +1697,9 @@ public:
Server &set_keep_alive_max_count(size_t count);
Server &set_keep_alive_timeout(time_t sec);
template <class Rep, class Period>
Server &
set_keep_alive_timeout(const std::chrono::duration<Rep, Period> &duration);
Server &set_read_timeout(time_t sec, time_t usec = 0);
template <class Rep, class Period>
@ -2790,10 +2826,26 @@ public:
"This function will be removed by v1.0.0.")]]
SSL_CTX *ssl_context() const;
// Override of a deprecated virtual in ClientImpl. Suppress C4996 /
// -Wdeprecated-declarations on the override declaration itself so that
// MSVC /sdl builds compile cleanly. Will be removed together with the
// base virtual by v1.0.0.
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4996)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
[[deprecated("Use set_session_verifier(session_t) instead. "
"This function will be removed by v1.0.0.")]]
void set_server_certificate_verifier(
std::function<SSLVerifierResponse(SSL *ssl)> verifier) override;
#if defined(_MSC_VER)
#pragma warning(pop)
#elif defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
private:
bool verify_host(X509 *server_cert) const;