mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build-cross.yml # .github/workflows/build-self-hosted.yml # .github/workflows/release.yml # examples/llama.android/lib/src/main/cpp/CMakeLists.txt # ggml/CMakeLists.txt # ggml/src/ggml-rpc/CMakeLists.txt # ggml/src/ggml-rpc/ggml-rpc.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/sync_vendor.py # tests/test-chat.cpp # tests/test-mtmd-c-api.c # tools/server/README.md
This commit is contained in:
commit
cd6788007e
86 changed files with 1384 additions and 1240 deletions
|
|
@ -294,7 +294,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
|||
hf_tag = "default";
|
||||
}
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
std::string model_endpoint = common_get_model_endpoint();
|
||||
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
|
||||
|
||||
// prepare local path for caching
|
||||
|
|
@ -1318,13 +1318,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"--clear-idle"},
|
||||
{"--no-clear-idle"},
|
||||
{"--cache-idle-slots"},
|
||||
{"--no-cache-idle-slots"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.clear_idle = value;
|
||||
params.cache_idle_slots = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
|
|
|
|||
|
|
@ -443,14 +443,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
|||
if (!format.per_call_start.empty()) {
|
||||
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
|
||||
}
|
||||
if (!format.section_start.empty()) {
|
||||
tool_calls = p.trigger_rule("tool-calls",
|
||||
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
|
||||
}
|
||||
} else {
|
||||
std::string separator = ", "; // Default
|
||||
|
|
|
|||
|
|
@ -2349,7 +2349,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
|||
? input
|
||||
: params.generation_prompt + input;
|
||||
|
||||
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
//LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
|
||||
|
||||
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
|
||||
if (params.debug) {
|
||||
|
|
|
|||
|
|
@ -1388,7 +1388,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
|||
|
||||
common_init_result::~common_init_result() = default;
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
std::string common_get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
|
|
@ -1403,6 +1403,42 @@ std::string get_model_endpoint() {
|
|||
return model_endpoint;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
if (mem == nullptr) {
|
||||
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
std::vector<llama_adapter_lora *> loras;
|
||||
std::vector<float> scales;
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@
|
|||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
|
|
@ -304,15 +303,15 @@ struct common_params_speculative {
|
|||
// general-purpose speculative decoding parameters
|
||||
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
// ngram-based speculative decoding
|
||||
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
|
|
@ -568,7 +567,7 @@ struct common_params {
|
|||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool clear_idle = true; // save and clear idle slots upon starting a new task
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
|
@ -848,7 +847,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
|||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
std::string get_model_endpoint();
|
||||
// model endpoint from env
|
||||
std::string common_get_model_endpoint();
|
||||
|
||||
//
|
||||
// Context utils
|
||||
//
|
||||
|
||||
enum common_context_seq_rm_type {
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module)
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
|
||||
COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
|
||||
};
|
||||
|
||||
// check if the llama_context can remove sequences
|
||||
// note: clears the memory of the context
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
|
||||
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
|
|
|
|||
|
|
@ -230,7 +230,7 @@ static nl::json api_get(const std::string & url,
|
|||
static std::string get_repo_commit(const std::string & repo_id,
|
||||
const std::string & token) {
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
|
||||
|
||||
if (!json.is_object() ||
|
||||
|
|
@ -308,7 +308,7 @@ hf_files get_repo_files(const std::string & repo_id,
|
|||
hf_files files;
|
||||
|
||||
try {
|
||||
auto endpoint = get_model_endpoint();
|
||||
auto endpoint = common_get_model_endpoint();
|
||||
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
|
||||
|
||||
if (!json.is_array()) {
|
||||
|
|
|
|||
|
|
@ -208,7 +208,7 @@ void common_ngram_map_begin(
|
|||
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
|
||||
}
|
||||
|
||||
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
|
||||
map.idx_last_check = size_begin;
|
||||
map.size_last_begin = size_begin;
|
||||
}
|
||||
|
||||
|
|
@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
|||
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
|
||||
}
|
||||
|
||||
if (map.idx_last_check > cur_len) {
|
||||
if (map.idx_last_check > cur_len) {
|
||||
// Should not happen because of common_ngram_map_begin().
|
||||
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
|
||||
}
|
||||
|
|
@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
|||
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
|
||||
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
|
||||
|
||||
map.last_draft_created = false;
|
||||
map.last_draft_created = true;
|
||||
map.last_draft_key_idx = key_offset;
|
||||
map.last_draft_value_idx = 0; // value 0 is used for simple mode
|
||||
return;
|
||||
|
|
@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
|
|||
struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
|
||||
|
||||
// update the value statistics
|
||||
LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
|
||||
n_accepted, curr_value.n_accepted);
|
||||
curr_value.n_accepted = n_accepted;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <cinttypes>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
|
@ -144,10 +145,28 @@ struct common_speculative_state {
|
|||
virtual void accept(uint16_t n_accepted) = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_checkpoint {
|
||||
llama_pos pos_min = 0;
|
||||
llama_pos pos_max = 0;
|
||||
|
||||
int64_t n_tokens = 0;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
size_t ckpt_size = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft : public common_speculative_state {
|
||||
llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
|
||||
llama_context * ctx_dft;
|
||||
|
||||
bool use_ckpt = false;
|
||||
struct common_speculative_checkpoint ckpt;
|
||||
|
||||
common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
|
|
@ -160,10 +179,12 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
enum common_speculative_type type,
|
||||
llama_context * ctx_tgt,
|
||||
llama_context * ctx_dft,
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements)
|
||||
const std::vector<std::pair<std::string, std::string>> & replacements,
|
||||
bool use_ckpt)
|
||||
: common_speculative_state(type)
|
||||
, ctx_tgt(ctx_tgt)
|
||||
, ctx_dft(ctx_dft)
|
||||
, use_ckpt(use_ckpt)
|
||||
{
|
||||
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
smpl = nullptr;
|
||||
|
|
@ -218,7 +239,48 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
GGML_UNUSED(prompt);
|
||||
if (use_ckpt && ckpt.size() > 0) {
|
||||
// delete checkpoint
|
||||
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
|
||||
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
|
||||
ckpt.pos_min = 0;
|
||||
ckpt.pos_max = 0;
|
||||
ckpt.n_tokens = 0;
|
||||
ckpt.ckpt_size = 0;
|
||||
ckpt.data.clear();
|
||||
}
|
||||
}
|
||||
|
||||
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
|
||||
int slot_id = 0;
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
|
||||
ckpt.data.resize(checkpoint_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != checkpoint_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
|
||||
}
|
||||
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d, size = %.3f MiB\n", __func__,
|
||||
ckpt.pos_min, ckpt.pos_max, (float) ckpt.data.size() / 1024 / 1024);
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
|
||||
int slot_id = 0;
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != ckpt_size_part_expected) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
|
||||
}
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
void draft(
|
||||
|
|
@ -236,8 +298,8 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
|
||||
int reuse_i = 0;
|
||||
int reuse_n = 0;
|
||||
int reuse_i = 0; // index of part to be reused in prompt_dft
|
||||
int reuse_n = 0; // length of part to be reused in prompt_dft
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
|
||||
|
||||
|
|
@ -287,18 +349,26 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
|
||||
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
|
||||
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
|
||||
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
|
||||
__func__, reuse_i, reuse_n);
|
||||
reuse_i = 0;
|
||||
reuse_n = 0;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
result.reserve(params.n_max);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
|
||||
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
|
||||
llama_memory_clear(mem_dft, false);
|
||||
prompt_dft.clear();
|
||||
} else {
|
||||
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
if (reuse_i + reuse_n < (int64_t) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
|
||||
result.push_back(prompt_dft[i]);
|
||||
|
||||
|
|
@ -310,19 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
return;
|
||||
}
|
||||
|
||||
bool do_restore = false;
|
||||
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
|
||||
// This can happen after a partial acceptance (speculative decoding with checkpoints)
|
||||
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
|
||||
__func__, prompt_dft.size(), prompt_cur.size());
|
||||
prompt_dft.resize(prompt_cur.size());
|
||||
do_restore = true;
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
|
||||
}
|
||||
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt_dft.size()) {
|
||||
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
if (reuse_n < (int) prompt_dft.size() || do_restore) {
|
||||
if (use_ckpt) {
|
||||
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
|
||||
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
|
||||
}
|
||||
draft_restore_checkpoint(ckpt.ckpt_size);
|
||||
reuse_n = ckpt.n_tokens;
|
||||
prompt_dft.resize(reuse_n);
|
||||
needs_ckpt = false;
|
||||
} else {
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, reuse_n, prompt_dft.size());
|
||||
}
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_ckpt) {
|
||||
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
|
||||
|
|
@ -337,7 +438,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size());
|
||||
}
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt_dft.size();
|
||||
|
|
@ -351,7 +456,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
|
||||
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
|
||||
llama_decode(ctx_dft, batch);
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
|
|
@ -387,7 +496,11 @@ struct common_speculative_state_draft : public common_speculative_state {
|
|||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx_dft, batch);
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d, prompt_cur.size=%zu, prompt_dft.size=%zu\n",
|
||||
__func__, i, ret, prompt_cur.size(), prompt_dft.size());
|
||||
}
|
||||
|
||||
prompt_dft.push_back(id);
|
||||
}
|
||||
|
|
@ -739,6 +852,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
|
|||
|
||||
struct common_speculative {
|
||||
std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
|
||||
|
||||
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
|
||||
};
|
||||
|
||||
|
|
@ -798,42 +912,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
|||
return it->second;
|
||||
}
|
||||
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt) {
|
||||
auto * mem = llama_get_memory(ctx_tgt);
|
||||
if (mem == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res = true;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx_tgt);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(
|
||||
|
|
@ -908,10 +986,13 @@ common_speculative * common_speculative_init(
|
|||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT: {
|
||||
const bool use_ckpt = common_context_can_seq_rm(ctx_dft) == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
|
||||
/* .ctx_tgt = */ ctx_tgt,
|
||||
/* .ctx_dft = */ ctx_dft,
|
||||
/* .replacements = */ params.replacements
|
||||
/* .replacements = */ params.replacements,
|
||||
/* .use_ckpt = */ use_ckpt
|
||||
));
|
||||
break;
|
||||
}
|
||||
|
|
@ -966,7 +1047,8 @@ common_speculative * common_speculative_init(
|
|||
}
|
||||
|
||||
auto * result = new common_speculative {
|
||||
/* .impls = */ std::move(impls)
|
||||
/* .impls = */ std::move(impls),
|
||||
/* .curr_impl = */ nullptr,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
|||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
// check if the llama_context is compatible for speculative decoding
|
||||
// note: clears the memory of the context
|
||||
bool common_speculative_is_compat(llama_context * ctx_tgt);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
|
|
@ -39,3 +35,9 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
|||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
struct common_speculative_deleter {
|
||||
void operator()(common_speculative * s) { common_speculative_free(s); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
|
||||
|
|
|
|||
|
|
@ -1850,20 +1850,28 @@ class TextModel(ModelBase):
|
|||
with open(module_path, encoding="utf-8") as f:
|
||||
modules = json.load(f)
|
||||
for mod in modules:
|
||||
if mod["type"] == "sentence_transformers.models.Pooling":
|
||||
if mod["type"].endswith("Pooling"):
|
||||
pooling_path = mod["path"]
|
||||
break
|
||||
|
||||
mode_mapping = {
|
||||
"mean": gguf.PoolingType.MEAN,
|
||||
"cls": gguf.PoolingType.CLS,
|
||||
"lasttoken": gguf.PoolingType.LAST,
|
||||
}
|
||||
|
||||
# get pooling type
|
||||
if pooling_path is not None:
|
||||
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
|
||||
pooling = json.load(f)
|
||||
if pooling["pooling_mode_mean_tokens"]:
|
||||
if pooling.get("pooling_mode_mean_tokens"):
|
||||
pooling_type = gguf.PoolingType.MEAN
|
||||
elif pooling["pooling_mode_cls_token"]:
|
||||
elif pooling.get("pooling_mode_cls_token"):
|
||||
pooling_type = gguf.PoolingType.CLS
|
||||
elif pooling["pooling_mode_lasttoken"]:
|
||||
elif pooling.get("pooling_mode_lasttoken"):
|
||||
pooling_type = gguf.PoolingType.LAST
|
||||
elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping:
|
||||
pooling_type = mode_mapping[pooling_mode]
|
||||
else:
|
||||
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
||||
self.gguf_writer.add_pooling_type(pooling_type)
|
||||
|
|
@ -7180,7 +7188,7 @@ class EmbeddingGemma(Gemma3Model):
|
|||
with open(modules_file, encoding="utf-8") as modules_json_file:
|
||||
mods = json.load(modules_json_file)
|
||||
for mod in mods:
|
||||
if mod["type"] == "sentence_transformers.models.Dense":
|
||||
if mod["type"].endswith("Dense"):
|
||||
mod_path = mod["path"]
|
||||
# check if model.safetensors file for Dense layer exists
|
||||
model_tensors_file = self.dir_model / mod_path / "model.safetensors"
|
||||
|
|
@ -10912,14 +10920,14 @@ class NemotronHModel(GraniteHybridModel):
|
|||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||
self.hparams["vocab_size"] = vocab_size
|
||||
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
|
|
@ -10930,7 +10938,7 @@ class NemotronHModel(GraniteHybridModel):
|
|||
if token in added_vocab:
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
|
|
|
|||
|
|
@ -1270,7 +1270,45 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
|||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
size_t offset_data = 0;
|
||||
std::vector<size_t> simple_offsets(n_bufs, 0);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[1] == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[2] == size);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
|
|
@ -1418,6 +1456,8 @@ struct ggml_backend_meta_context {
|
|||
int max_nnodes = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
size_t n_subgraphs = 0;
|
||||
uint64_t uid = 0;
|
||||
|
||||
void * comm_ctx = nullptr;
|
||||
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
|
||||
|
|
@ -1578,6 +1618,9 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
const size_t n_backends = ggml_backend_meta_n_backends(backend);
|
||||
ggml_backend_meta_context * backend_ctx = (ggml_backend_meta_context *) backend->context;
|
||||
|
||||
// If the previous cgraph had a defined UID it can be used to skip rebuilding the subgraphs per simple backend.
|
||||
const bool needs_rebuild = (cgraph->uid == 0) || (cgraph->uid != backend_ctx->uid);
|
||||
|
||||
bool max_nnodes_raised = false;
|
||||
if (cgraph->n_nodes > backend_ctx->max_nnodes) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
|
|
@ -1587,173 +1630,181 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
}
|
||||
backend_ctx->max_nnodes = cgraph->n_nodes;
|
||||
max_nnodes_raised = true;
|
||||
assert(needs_rebuild);
|
||||
}
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
if (needs_rebuild) {
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
// FIXME s_copy_main is on the CPU and its view seems to be incorrectly added to the graph nodes.
|
||||
// For regular usage this doesn't matter since it's a noop but trying to call ggml_backend_meta_buffer_simple_tensor results in a crash.
|
||||
bcj.nodes[i] = node;
|
||||
continue;
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
bcj.nodes[i] = ggml_backend_meta_buffer_simple_tensor(node, j);
|
||||
GGML_ASSERT(bcj.nodes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
{
|
||||
// For MoE models it may make sense to delay the AllReduce in order to reduce I/O:
|
||||
auto get_i_delayed = [&](const int i) -> int {
|
||||
int id = i; // i_delayed
|
||||
int idr = i; // i_delayed return, last safe return value
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_ADD_ID && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL &&
|
||||
ggml_backend_meta_get_split_state(next->src[2], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
node = next;
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_used != node->ne[1] || id + 2*n_used-1 >= cgraph->n_nodes) {
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_VIEW || next->view_src != node || next->view_offs != k*node->nb[1] ||
|
||||
next->ne[0] != node->ne[0] || next->ne[1] != node->ne[2] || next->nb[1] != node->nb[2] ||
|
||||
ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
{
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id - (n_used-1)] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
for (int32_t k = 0; k < n_used - 2; k++) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op != GGML_OP_ADD || next->src[0] != cgraph->nodes[id] ||
|
||||
next->src[1] != cgraph->nodes[id - (n_used-2)] || ggml_node_get_use_count(cgraph, id+1) != 1) {
|
||||
return idr;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
idr = id;
|
||||
return idr;
|
||||
};
|
||||
};
|
||||
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
int i_start = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
if (node->view_src != nullptr && node->view_src->op == GGML_OP_NONE && ggml_backend_buffer_is_host(node->view_src->buffer)) {
|
||||
continue;
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(node, /*assume_sync =*/ false);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL) {
|
||||
max_tmp_size = std::max(max_tmp_size, ggml_nbytes(node));
|
||||
}
|
||||
const bool new_subgraph = i + 1 == cgraph->n_nodes || split_state.axis == GGML_BACKEND_SPLIT_AXIS_PARTIAL;
|
||||
if (!new_subgraph) {
|
||||
continue;
|
||||
}
|
||||
|
||||
i = get_i_delayed(i);
|
||||
i = get_i_delayed(i);
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
backend_ctx->uid = cgraph->uid;
|
||||
backend_ctx->n_subgraphs = n_subgraphs;
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.cgraphs[n_subgraphs].offset = i_start;
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
n_subgraphs++;
|
||||
i_start = i + 1;
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
GGML_ASSERT(i_start == cgraph->n_nodes);
|
||||
}
|
||||
|
||||
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
|
||||
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
backend_ctx->ctx.reset(ggml_init(params));
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
bcj.cgraphs[i].cgraph_main = ggml_new_graph_custom(backend_ctx->ctx.get(), cgraph->n_nodes, /*grads =*/ false);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
backend_ctx->cgraphs_aux.resize(n_backends*n_cgraphs_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->cgraphs_aux.size(); k++) {
|
||||
backend_ctx->cgraphs_aux[k] = ggml_new_graph_custom(backend_ctx->ctx.get(), 1, cgraph->grads);
|
||||
}
|
||||
backend_ctx->nodes_aux.resize(n_backends*n_nodes_per_device*backend_ctx->max_subgraphs);
|
||||
for (size_t k = 0; k < backend_ctx->nodes_aux.size(); k++) {
|
||||
backend_ctx->nodes_aux[k] = ggml_new_tensor_1d(backend_ctx->ctx.get(), GGML_TYPE_F32, 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
for (size_t i_graph = 0; i_graph < n_subgraphs; i_graph++) {
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i_graph].cgraph_main;
|
||||
const size_t i_node_start = bcj.cgraphs[i_graph].offset;
|
||||
const size_t i_node_stop = i_graph + 1 < n_subgraphs ? bcj.cgraphs[i_graph + 1].offset : cgraph->n_nodes;
|
||||
cgraph_ij->n_nodes = i_node_stop - i_node_start;
|
||||
ggml_hash_set_reset(&cgraph_ij->visited_hash_set);
|
||||
for (size_t i_node = i_node_start; i_node < i_node_stop; i_node++) {
|
||||
ggml_tensor * node_ij = bcj.nodes[i_node];
|
||||
cgraph_ij->nodes[i_node - i_node_start] = node_ij;
|
||||
const size_t hash_pos_orig = ggml_hash_find(&cgraph->visited_hash_set, cgraph->nodes[i_node]);
|
||||
const size_t hash_pos_ij = ggml_hash_insert(&cgraph_ij->visited_hash_set, node_ij);
|
||||
cgraph_ij->use_counts[hash_pos_ij] = cgraph->use_counts[hash_pos_orig];
|
||||
}
|
||||
cgraph_ij->uid = ggml_graph_next_uid();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1860,7 +1911,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
};
|
||||
|
||||
|
||||
for (size_t i = 0; i < n_subgraphs; i++) {
|
||||
for (size_t i = 0; i < backend_ctx->n_subgraphs; i++) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, bcj.cgraphs[i].cgraph_main);
|
||||
|
|
@ -1869,7 +1920,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
}
|
||||
}
|
||||
|
||||
if (n_backends > 1 && i < n_subgraphs - 1) {
|
||||
if (n_backends > 1 && i < backend_ctx->n_subgraphs - 1) {
|
||||
bool backend_allreduce_success = false;
|
||||
if (backend_ctx->comm_ctx) {
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
|
|
|
|||
|
|
@ -274,10 +274,6 @@ static const char * cu_get_error_str(CUresult err) {
|
|||
#define FLASH_ATTN_AVAILABLE
|
||||
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
||||
|
||||
#if defined(TURING_MMA_AVAILABLE)
|
||||
#define LDMATRIX_TRANS_AVAILABLE
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static bool fp16_available(const int cc) {
|
||||
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
|
||||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
|
||||
|
|
@ -1194,6 +1190,7 @@ struct ggml_cuda_graph {
|
|||
bool disable_due_to_gpu_arch = false;
|
||||
bool warmup_complete = false;
|
||||
uint64_t uid = 0;
|
||||
int64_t last_used_time = 0;
|
||||
struct node_properties {
|
||||
ggml_tensor node;
|
||||
void * node_src_data_ptrs[GGML_MAX_SRC];
|
||||
|
|
@ -1375,12 +1372,28 @@ struct ggml_backend_cuda_context {
|
|||
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
|
||||
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
|
||||
|
||||
int64_t last_graph_eviction_sweep = 0;
|
||||
|
||||
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
|
||||
const int64_t time_now = ggml_time_us();
|
||||
|
||||
// sweep every 5s, evicting cuda graphs unused for >=10s
|
||||
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
|
||||
last_graph_eviction_sweep = time_now;
|
||||
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
|
||||
if (time_now - it->second->last_used_time >= 10'000'000) {
|
||||
it = cuda_graphs.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto it = cuda_graphs.find(first_node_ptr);
|
||||
if (it == cuda_graphs.end()) {
|
||||
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
|
||||
return cuda_graphs[first_node_ptr].get();
|
||||
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
|
||||
}
|
||||
it->second->last_used_time = time_now;
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -305,12 +305,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
|||
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
|
||||
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
|
||||
// The minimum granularity is 16 bytes.
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
if constexpr (use_cp_async) {
|
||||
static_assert(warp_size == 32, "bad warp_size");
|
||||
static_assert(!oob_check, "OOB check not compatible with cp_async");
|
||||
constexpr int preload = 64;
|
||||
constexpr int h2_per_chunk = 16/sizeof(half2);
|
||||
const int chunks_per_row = D2 / h2_per_chunk;
|
||||
|
||||
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
|
||||
|
||||
|
|
@ -348,11 +349,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
|||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
} else {
|
||||
// TODO use ggml_cuda_memcpy_1
|
||||
const half2 zero[4] = {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}};
|
||||
auto load = [&] __device__ (const int n) {
|
||||
const int stride_k = warp_size >> n;
|
||||
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
|
||||
const int k0_stop = D2 - D2 % (1*stride_k);
|
||||
const int stride_k = 32 >> n;
|
||||
const int k0_start = stride_k == 32 ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
|
||||
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
|
||||
const int stride_i = warp_size / stride_k;
|
||||
|
||||
if (k0_start == k0_stop) {
|
||||
|
|
@ -371,15 +372,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
|
|||
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
|
||||
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
|
||||
|
||||
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
|
||||
ggml_cuda_memcpy_1<16>(tile_KV + i*stride_tile + k*4,
|
||||
!oob_check || i < i_sup ? KV + i*stride_KV + k*h2_per_chunk : zero);
|
||||
}
|
||||
}
|
||||
};
|
||||
// 1: max 32* 4=128 bytes, 64 half
|
||||
// 2: max 16* 4= 64 bytes, 32 half
|
||||
// 3: max 8* 4= 32 bytes, 16 half
|
||||
// 4: max 4* 4= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<4>{}(load);
|
||||
// 1: max 32*16=512 bytes, 256 half
|
||||
// 2: max 16*16=256 bytes, 128 half
|
||||
// 3: max 8*16=128 bytes, 64 half
|
||||
// 4: max 4*16= 64 bytes, 32 half
|
||||
// 5: max 2*16= 32 bytes, 16 half
|
||||
// 6: max 1*16= 16 bytes, 8 half
|
||||
ggml_cuda_unroll<6>{}(load);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -862,11 +866,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
|||
}
|
||||
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
T_A_VKQ A_identity;
|
||||
make_identity_mat(A_identity);
|
||||
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
|
||||
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
|
||||
#pragma unroll
|
||||
for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) {
|
||||
|
|
@ -897,29 +896,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
|||
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
|
||||
|
||||
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
|
||||
#if defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
|
||||
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
|
||||
// Load with transposed addressing: 4 strided half loads.
|
||||
{
|
||||
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
|
||||
const half * xs0_h = (const half *) xs0;
|
||||
const int stride_h = stride_tile_V * 2; // stride in half units
|
||||
half * A_h = (half *) A.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
|
||||
}
|
||||
}
|
||||
#else
|
||||
// TODO: Try to transpose tile_V when loading gmem to smem.
|
||||
// Use mma to transpose T_A_VKQ for RDNA.
|
||||
T_A_VKQ A_trans;
|
||||
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
|
||||
mma(A, A_trans, A_identity);
|
||||
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
|
||||
if constexpr (T_B_KQ::I == 8) {
|
||||
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -86,17 +86,12 @@ namespace ggml_cuda_mma {
|
|||
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
|
||||
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
|
||||
|
||||
static constexpr bool is_i_major(const data_layout dl) {
|
||||
return dl == DATA_LAYOUT_I_MAJOR ||
|
||||
dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
}
|
||||
|
||||
static constexpr __device__ data_layout get_input_data_layout() {
|
||||
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
return DATA_LAYOUT_I_MAJOR_MIRRORED;
|
||||
#else
|
||||
return DATA_LAYOUT_I_MAJOR;
|
||||
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(RDNA3) || defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
|
||||
|
|
@ -113,7 +108,6 @@ namespace ggml_cuda_mma {
|
|||
T x[ne] = {0};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 64 && J == 2) return true;
|
||||
if (I == 16 && J == 8) return true;
|
||||
if (I == 32 && J == 4) return true;
|
||||
if (I == 16 && J == 16) return true;
|
||||
|
|
@ -122,7 +116,7 @@ namespace ggml_cuda_mma {
|
|||
}
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
|
|
@ -139,8 +133,8 @@ namespace ggml_cuda_mma {
|
|||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return (2 * ((threadIdx.x / 16) % 2) + l);
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
return threadIdx.x / 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return 2 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
|
|
@ -154,7 +148,7 @@ namespace ggml_cuda_mma {
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
|
|
@ -283,7 +277,7 @@ namespace ggml_cuda_mma {
|
|||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
|
|
@ -407,7 +401,7 @@ namespace ggml_cuda_mma {
|
|||
return -1;
|
||||
}
|
||||
}
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
|
|
@ -701,57 +695,12 @@ namespace ggml_cuda_mma {
|
|||
}
|
||||
#endif // defined(TURING_MMA_AVAILABLE)
|
||||
|
||||
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
|
||||
#if defined(RDNA4)
|
||||
const int row = t.get_i(0);
|
||||
const int left_right = t.get_j(0) / 4;
|
||||
const int up_down = row / 8;
|
||||
const int idx = row % 8;
|
||||
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
|
||||
#else
|
||||
GGML_UNUSED_VARS(t);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
}
|
||||
|
||||
template <int I, int J, typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
// All wmma layout has contiguous data when i-major.
|
||||
if constexpr (is_i_major(dl)) {
|
||||
// the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes()
|
||||
constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
|
||||
if constexpr (sizeof(t.x) > aligned_copy_bytes) {
|
||||
static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size");
|
||||
constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < aligned_copy_count; ++i) {
|
||||
ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i));
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -764,26 +713,37 @@ namespace ggml_cuda_mma {
|
|||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, data_layout dl>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 4, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
tile<16, 4, T, dl> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride;
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "=r"(xi[0]), "=r"(xi[1])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<8>(t.x + 2, xs0 + t.get_i(0)*stride + 2);
|
||||
#else
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 4, "bad ne");
|
||||
ggml_cuda_memcpy_1<4>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
|
|
@ -796,19 +756,26 @@ namespace ggml_cuda_mma {
|
|||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#else
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if 1
|
||||
// TODO: more generic handling
|
||||
static_assert(sizeof(T) == 4, "bad type size");
|
||||
#elif defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
#ifdef RDNA3
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 32, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
|
||||
ggml_cuda_memcpy_1<16>(t.x + 4, xs0 + t.get_i(0)*stride + 4);
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // 1
|
||||
static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout");
|
||||
static_assert(sizeof(t.x) == 16, "bad ne");
|
||||
ggml_cuda_memcpy_1<16>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#endif // RDNA3
|
||||
#elif defined(AMD_MFMA_AVAILABLE)
|
||||
static_assert(sizeof(t.x) == 8, "bad ne");
|
||||
ggml_cuda_memcpy_1<8>(t.x, xs0 + t.get_i(0)*stride + t.get_j(0));
|
||||
#else
|
||||
load_generic(t, xs0, stride);
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // TURING_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
|
|
@ -827,23 +794,30 @@ namespace ggml_cuda_mma {
|
|||
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix_trans(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef TURING_MMA_AVAILABLE
|
||||
int * xi = (int * ) t.x;
|
||||
int * xi = (int *) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3])
|
||||
: "l"(xs));
|
||||
#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
half * xh = (half *) t.x;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)];
|
||||
xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)];
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(t, xs0, stride);
|
||||
NO_DEVICE_CODE;
|
||||
|
|
@ -1218,73 +1192,27 @@ namespace ggml_cuda_mma {
|
|||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[1],
|
||||
true,
|
||||
b_vec[1],
|
||||
acc[0],
|
||||
true
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], true);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[1], true, b_vec[1], acc[0], true);
|
||||
#endif // RDNA4
|
||||
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
|
|
@ -1297,19 +1225,10 @@ namespace ggml_cuda_mma {
|
|||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0], ((int64_t *) B.x)[0], acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1], B.x[1], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
|
||||
#else
|
||||
|
|
@ -1329,7 +1248,7 @@ namespace ggml_cuda_mma {
|
|||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
|
|
@ -1344,12 +1263,12 @@ namespace ggml_cuda_mma {
|
|||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
|
||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(VOLTA_MMA_AVAILABLE)
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
int * Dxi = (int *) D.x;
|
||||
|
|
@ -1364,41 +1283,35 @@ namespace ggml_cuda_mma {
|
|||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(VOLTA_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <data_layout dl_d, data_layout dl_ab>
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) {
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA4) || defined(CDNA3)
|
||||
const int64_t xA = uint32_t(A.x[0]);
|
||||
const int64_t xB = uint32_t(B.x[0]);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(xA, xB, acc[0], 0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA1)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0], B.x[0], acc[0], 0, 0, 0);
|
||||
#endif // defined(CDNA4) || defined(CDNA3)
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||
int32x8_t * acc = (int32x8_t *) D.x;
|
||||
#if defined(RDNA4)
|
||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#elif defined(RDNA3)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||
true,
|
||||
a_vec[0],
|
||||
true,
|
||||
b_vec[0],
|
||||
acc[0],
|
||||
false
|
||||
);
|
||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a_vec[0], true, b_vec[0], acc[0], false);
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ struct tile_x_sizes {
|
|||
};
|
||||
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
return (turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
|
||||
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
128 : 64;
|
||||
|
|
@ -115,9 +115,9 @@ static int get_mmq_x_max_host(const int cc) {
|
|||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
return 128;
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
|
||||
#else // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
return 64;
|
||||
|
|
@ -1055,13 +1055,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
|||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
float dB;
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
|
|
@ -1296,13 +1296,13 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
|
|||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
|
||||
|
|
@ -1436,57 +1436,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
|
|||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
|
|
@ -1511,13 +1461,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
|
|||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
|
|
@ -1743,74 +1693,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
|
|||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const half2 * y_ds = (const half2 *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x/2 : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y/2;
|
||||
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
|
||||
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
|
||||
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);
|
||||
|
||||
tile_C Cm;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tile_A A1;
|
||||
A1.x[0] = 0x01010101;
|
||||
A1.x[1] = 0x01010101;
|
||||
mma(Cm, A1, B[0]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C Cd;
|
||||
mma(Cd, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
|
||||
float tmp = Cd.x[l]*dm.x;
|
||||
if (k01 >= MMQ_TILE_NE_K * 3/4) {
|
||||
tmp -= Cm.x[l]*dm.y;
|
||||
}
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
|
|
@ -1835,13 +1718,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
|
|||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
|
||||
|
|
@ -2574,59 +2457,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
|
|||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 8, int, input_layout> tile_A;
|
||||
typedef tile<16, 8, int, input_layout> tile_B;
|
||||
typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
|
||||
typedef tile<64, 2, int, input_layout> tile_load;
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = granularity;
|
||||
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
|
||||
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const float * y_df = (const float *) y;
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(((tile_load *) A)[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B[1];
|
||||
load_generic(((tile_load *) B)[0], y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1] / 2;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
mma(C, A[n], B[0]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
|
||||
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
|
||||
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
constexpr data_layout input_layout = get_input_data_layout();
|
||||
typedef tile<16, 4, int, input_layout> tile_A;
|
||||
typedef tile<16, 4, int, input_layout> tile_B;
|
||||
|
|
@ -2652,13 +2483,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
|
|||
tile_A A[ntx];
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
|
||||
tile_B B;
|
||||
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
|
||||
|
||||
const int j = j0 + tile_C::get_j(0);
|
||||
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
|
||||
|
|
|
|||
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
|
|
@ -33,7 +33,6 @@
|
|||
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL Failure RCCL returned: %i\n", err); }}
|
||||
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
|
||||
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
diagnostic(off, subgroup_uniformity);
|
||||
enable f16;
|
||||
|
||||
#define Q_TILE 1
|
||||
#define KV_TILE 32
|
||||
#define WG_SIZE 32
|
||||
|
||||
|
|
@ -11,7 +10,7 @@ struct Params {
|
|||
seq_len_kv: u32,
|
||||
stride_mask3: u32,
|
||||
// Number of KV blocks and Q blocks per batch.
|
||||
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = ceil(seq_len_q / Q_TILE).
|
||||
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = seq_len_q.
|
||||
nblk0: u32,
|
||||
nblk1: u32,
|
||||
};
|
||||
|
|
@ -40,7 +39,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
|||
return;
|
||||
}
|
||||
|
||||
let q_start = q_blk * Q_TILE;
|
||||
let q_start = q_blk;
|
||||
let k_start = kv_blk * KV_TILE;
|
||||
|
||||
let mask_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
|
||||
|
|
@ -54,11 +53,8 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
|||
var local_max = -MASK_MAX;
|
||||
var local_any = 0u;
|
||||
|
||||
for (var q_rel = 0u; q_rel < Q_TILE; q_rel += 1u) {
|
||||
let q_row = q_start + q_rel;
|
||||
if (q_row >= params.seq_len_q) {
|
||||
continue;
|
||||
}
|
||||
let q_row = q_start;
|
||||
if (q_row < params.seq_len_q) {
|
||||
let row_base = mask_batch_base + q_row * params.seq_len_kv;
|
||||
for (var k_rel = local_id.x; k_rel < KV_TILE; k_rel += WG_SIZE) {
|
||||
let k_col = k_start + k_rel;
|
||||
|
|
|
|||
|
|
@ -1077,9 +1077,9 @@ llm_graph_qkv llm_graph_context::build_qkv(
|
|||
// fused QKV path
|
||||
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
|
||||
cb(qkv, "wqkv", il);
|
||||
if (layer.bqkv) {
|
||||
qkv = ggml_add(ctx0, qkv, layer.bqkv);
|
||||
cb(qkv, "bqkv", il);
|
||||
if (layer.wqkv_b) {
|
||||
qkv = ggml_add(ctx0, qkv, layer.wqkv_b);
|
||||
cb(qkv, "wqkv_b", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
|
|
@ -1097,8 +1097,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
|
|||
// separate Q/K/V path
|
||||
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (layer.bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.bq);
|
||||
if (layer.wq_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.wq_b);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
|
|
@ -1107,8 +1107,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
|
|||
}
|
||||
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (layer.bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.bk);
|
||||
if (layer.wk_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.wk_b);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
|
|
@ -1117,8 +1117,8 @@ llm_graph_qkv llm_graph_context::build_qkv(
|
|||
}
|
||||
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (layer.bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.bv);
|
||||
if (layer.wv_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.wv_b);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
|
|
|
|||
|
|
@ -3217,14 +3217,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
if (layer.wqkv) {
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
} else {
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
|
||||
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
|
||||
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -3257,7 +3257,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -3360,7 +3360,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
|
||||
layer.wo =
|
||||
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
|
|
@ -3495,9 +3495,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
}
|
||||
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (n_ff > 0) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
|
@ -3717,10 +3716,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -3761,8 +3760,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -3878,23 +3877,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i]; // JinaBertLayer
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
|
||||
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
||||
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
|
||||
|
||||
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
|
||||
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -3942,10 +3934,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -3978,10 +3970,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
|
@ -4048,7 +4040,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
|
@ -4227,7 +4219,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -4286,7 +4278,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
|
||||
|
|
@ -4450,10 +4442,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -4488,7 +4480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -4805,7 +4797,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -5049,7 +5041,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// feed forward (w/ optional biases)
|
||||
|
|
@ -5311,10 +5303,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -5729,10 +5721,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -5771,10 +5763,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// attention biases - all have shape n_embd (output dimension of projections)
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -6077,7 +6069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
||||
|
|
@ -6146,7 +6138,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
} else {
|
||||
if (n_expert != 0) {
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
|
|
@ -6967,7 +6959,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -7049,7 +7041,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
// attention layers (with optional bias)
|
||||
create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
|
||||
|
||||
|
||||
|
|
@ -7185,7 +7177,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
|
||||
layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
|
||||
|
|
@ -7350,7 +7342,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
|
||||
|
|
@ -7581,7 +7573,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// bias tensors
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -8347,114 +8339,114 @@ void llama_model::print_info() const {
|
|||
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
||||
|
||||
size_t i = 0;
|
||||
for (auto label : classifier_labels) {
|
||||
for (const auto & label : classifier_labels) {
|
||||
LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_MAMBA ||
|
||||
arch == LLM_ARCH_MAMBA2 ||
|
||||
arch == LLM_ARCH_JAMBA ||
|
||||
arch == LLM_ARCH_FALCON_H1 ||
|
||||
arch == LLM_ARCH_PLAMO2 ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_QWEN3NEXT ||
|
||||
arch == LLM_ARCH_QWEN35 ||
|
||||
arch == LLM_ARCH_QWEN35MOE ||
|
||||
arch == LLM_ARCH_NEMOTRON_H ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
||||
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
||||
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
||||
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
||||
}
|
||||
if (arch == LLM_ARCH_MAMBA ||
|
||||
arch == LLM_ARCH_MAMBA2 ||
|
||||
arch == LLM_ARCH_JAMBA ||
|
||||
arch == LLM_ARCH_FALCON_H1 ||
|
||||
arch == LLM_ARCH_PLAMO2 ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_QWEN3NEXT ||
|
||||
arch == LLM_ARCH_QWEN35 ||
|
||||
arch == LLM_ARCH_QWEN35MOE ||
|
||||
arch == LLM_ARCH_NEMOTRON_H ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
||||
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
||||
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
||||
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
||||
if (pimpl->n_elements >= 1e12) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
|
||||
} else if (pimpl->n_elements >= 1e9) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
|
||||
} else if (pimpl->n_elements >= 1e6) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
||||
if (pimpl->n_elements >= 1e12) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
|
||||
} else if (pimpl->n_elements >= 1e9) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
|
||||
} else if (pimpl->n_elements >= 1e6) {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
|
||||
}
|
||||
|
||||
// general kv
|
||||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
|
||||
// general kv
|
||||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
|
||||
|
||||
if (arch == LLM_ARCH_DEEPSEEK) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
if (arch == LLM_ARCH_DEEPSEEK) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_QWEN2MOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
if (arch == LLM_ARCH_QWEN2MOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
}
|
||||
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_MINICPM ||
|
||||
arch == LLM_ARCH_GRANITE ||
|
||||
arch == LLM_ARCH_GRANITE_MOE ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
||||
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
if (arch == LLM_ARCH_MINICPM ||
|
||||
arch == LLM_ARCH_GRANITE ||
|
||||
arch == LLM_ARCH_GRANITE_MOE ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
||||
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_BAILINGMOE) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
}
|
||||
if (arch == LLM_ARCH_BAILINGMOE) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_BAILINGMOE2) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
|
||||
}
|
||||
if (arch == LLM_ARCH_BAILINGMOE2) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_GROVEMOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
|
||||
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
|
||||
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
|
||||
if (arch == LLM_ARCH_GROVEMOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
|
||||
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
|
||||
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
|
||||
}
|
||||
}
|
||||
|
||||
vocab.print_info();
|
||||
|
|
|
|||
|
|
@ -246,6 +246,8 @@ struct llama_layer {
|
|||
struct ggml_tensor * wkv_b = nullptr;
|
||||
struct ggml_tensor * wk_b = nullptr;
|
||||
struct ggml_tensor * wv_b = nullptr;
|
||||
struct ggml_tensor * wqkv_b = nullptr;
|
||||
struct ggml_tensor * wo_b = nullptr;
|
||||
struct ggml_tensor * wq_cross = nullptr;
|
||||
struct ggml_tensor * wk_cross = nullptr;
|
||||
struct ggml_tensor * wv_cross = nullptr;
|
||||
|
|
@ -256,13 +258,6 @@ struct llama_layer {
|
|||
struct ggml_tensor * wo_enc = nullptr;
|
||||
struct ggml_tensor * wqkv_gate = nullptr;
|
||||
|
||||
// attention bias
|
||||
struct ggml_tensor * bq = nullptr;
|
||||
struct ggml_tensor * bk = nullptr;
|
||||
struct ggml_tensor * bv = nullptr;
|
||||
struct ggml_tensor * bo = nullptr;
|
||||
struct ggml_tensor * bqkv = nullptr;
|
||||
|
||||
// relative position bias
|
||||
struct ggml_tensor * attn_rel_b = nullptr;
|
||||
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
||||
|
|
|
|||
157
src/llama.cpp
157
src/llama.cpp
|
|
@ -115,12 +115,16 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||
const size_t nd = model->n_devices();
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -128,7 +132,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (model->devices[i].dev == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
|
|
@ -137,7 +141,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
|
||||
|
|
@ -146,11 +162,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
|
|
@ -204,15 +217,15 @@ static void llama_params_fit_impl(
|
|||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
if (nd == 0) {
|
||||
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
|
|
@ -239,46 +252,59 @@ static void llama_params_fit_impl(
|
|||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -286,11 +312,15 @@ static void llama_params_fit_impl(
|
|||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd == 1) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
|
|
@ -301,8 +331,12 @@ static void llama_params_fit_impl(
|
|||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
|
|
@ -317,8 +351,12 @@ static void llama_params_fit_impl(
|
|||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
for (const auto & dmd : dmds_min_ctx) {
|
||||
sum_projected_used_min_ctx += dmd.mb.total();
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
|
|
@ -330,7 +368,7 @@ static void llama_params_fit_impl(
|
|||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd == 1) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
|
@ -353,6 +391,9 @@ static void llama_params_fit_impl(
|
|||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
|
|
@ -500,8 +541,8 @@ static void llama_params_fit_impl(
|
|||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (const llama_device_memory_data & dmd : dmd_nl) {
|
||||
ret.push_back(dmd.mb.total());
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
|
|||
cb(Vcur, "Vcur_pos", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,8 +57,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "attn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
if (model.layers[il].bo) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
if (model.layers[il].wo_b) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
|
||||
}
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
|
|||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
|
|||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
|
|||
const float kq_scale =
|
||||
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
return cur;
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
|
|||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
return cur;
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
|
|||
cb(Qcur, "Qcur_norm", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
|
|||
cb(Qcur, "Qcur_norm", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
|
|||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
|
|||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
cb(Kcur, "Kcur_normed", il);
|
||||
}
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_gr
|
|||
cb(Kcur, "Kcur_normed", il);
|
||||
}
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_gr
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
|
|||
}
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
|||
const float kq_scale =
|
||||
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
return cur;
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
||||
|
||||
cb(cur, "attn_out", il);
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1) {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, co
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params
|
|||
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
|
|||
cb(Qcur, "Qcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_grap
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
|
|||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_gr
|
|||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_par
|
|||
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
|
||||
|
||||
cur = build_attn(inp_attn_self,
|
||||
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
|
||||
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
|||
return n_pos;
|
||||
}
|
||||
|
||||
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
|
||||
void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
|
||||
size_t n_tokens = mtmd_image_tokens_get_n_tokens(chunks);
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, i);
|
||||
out_pos[i] = mtmd_image_tokens_get_decoder_pos(chunks, pos_0, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -163,15 +163,15 @@ struct decode_embd_batch {
|
|||
}
|
||||
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
|
||||
void set_position_mrope_2d(const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
GGML_ASSERT(!rel_pos.empty() && (int32_t)rel_pos.size() == batch.n_tokens);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + rel_pos[i].t;
|
||||
pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + rel_pos[i].x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
pos[i ] = rel_pos[i].t;
|
||||
pos[i + batch.n_tokens ] = rel_pos[i].y;
|
||||
pos[i + batch.n_tokens * 2] = rel_pos[i].x;
|
||||
pos[i + batch.n_tokens * 3] = rel_pos[i].z;
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
|
|
@ -188,7 +188,7 @@ struct decode_embd_batch {
|
|||
pos[i ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
pos[i + batch.n_tokens * 3] = pos_0 + i;
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
|
|
@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
|
|||
}
|
||||
const auto n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
|
||||
std::vector<mtmd_decoder_pos> rel_pos(n_tokens);
|
||||
mtmd_helper_image_get_decoder_pos(image_tokens, rel_pos.data());
|
||||
batch_embd.set_position_mrope_2d(n_past, rel_pos, seq_id);
|
||||
mtmd_helper_image_get_decoder_pos(image_tokens, n_past, rel_pos.data());
|
||||
batch_embd.set_position_mrope_2d(rel_pos, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
|||
|
||||
// helper to get the list of positions corresponding to the embedding tokens, to be used by M-RoPE; positions are offset by pos_0 (the absolute position of the first token)
|
||||
// out_pos must have length == mtmd_image_tokens_get_n_tokens(image)
|
||||
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, struct mtmd_decoder_pos * out_pos);
|
||||
MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
|
|
|
|||
|
|
@ -1246,11 +1246,14 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|||
return image_tokens->ny;
|
||||
}
|
||||
|
||||
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i) {
|
||||
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
|
||||
mtmd_decoder_pos pos;
|
||||
pos.t = 0;
|
||||
pos.x = i % image_tokens->nx;
|
||||
pos.y = i / image_tokens->nx;
|
||||
// M-RoPE logic
|
||||
// TODO: support other types of position encoding if needed
|
||||
pos.t = pos_0;
|
||||
pos.x = pos_0 + (i % image_tokens->nx);
|
||||
pos.y = pos_0 + (i / image_tokens->nx);
|
||||
pos.z = 0; // unused for now
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -196,11 +196,13 @@ struct mtmd_decoder_pos {
|
|||
uint32_t t;
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z; // unused for now, reserved for future use
|
||||
};
|
||||
// get position for decoder attention, to be used by M-RoPE models
|
||||
// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
|
||||
// pos_0 is the absolute position of the first token
|
||||
// returns the absolute position: t = pos_0, x = pos_0 + (i % nx), y = pos_0 + (i / nx), z = 0 (for example, embedding 0 will have t = x = y = pos_0)
|
||||
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, size_t i);
|
||||
MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
|
||||
|
||||
// tokenize an input text prompt and a list of bitmaps (images/audio)
|
||||
// the prompt must have the input image marker (default: "<__media__>") in it
|
||||
|
|
|
|||
|
|
@ -391,15 +391,25 @@ void server_tokens::push_back(server_tokens & tokens) {
|
|||
}
|
||||
|
||||
void server_tokens::insert(const llama_tokens & inp_tokens) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
|
||||
}
|
||||
|
||||
const llama_tokens & server_tokens::get_text_tokens() const {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
const llama_tokens & server_tokens::get_tokens() const {
|
||||
GGML_ASSERT(!has_mtmd);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
llama_tokens server_tokens::get_text_tokens() const {
|
||||
llama_tokens res;
|
||||
res.reserve(tokens.size());
|
||||
for (llama_token t : tokens) {
|
||||
if (t != LLAMA_TOKEN_NULL) {
|
||||
res.push_back(t);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
void server_tokens::set_token(llama_pos pos, llama_token id) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens[pos] = id;
|
||||
|
|
|
|||
|
|
@ -190,7 +190,9 @@ public:
|
|||
void insert(const llama_tokens & inp_tokens);
|
||||
|
||||
// for compatibility with speculative decoding, ctx shift, slot save/load
|
||||
const llama_tokens & get_text_tokens() const;
|
||||
const llama_tokens & get_tokens() const;
|
||||
|
||||
llama_tokens get_text_tokens() const;
|
||||
|
||||
// for compatibility with speculative decoding
|
||||
void set_token(llama_pos pos, llama_token id);
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
#include "server-context.h"
|
||||
#include "server-common.h"
|
||||
#include "server-http.h"
|
||||
|
|
@ -19,6 +20,7 @@
|
|||
#include <exception>
|
||||
#include <memory>
|
||||
#include <filesystem>
|
||||
#include <utility>
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
|
|
@ -33,6 +35,31 @@ using json = nlohmann::ordered_json;
|
|||
|
||||
constexpr int HTTP_POLLING_SECONDS = 1;
|
||||
|
||||
static server_prompt_checkpoint server_get_checkpoint(llama_context * ctx, int id, int64_t n_tokens, llama_pos pos_min = -1, llama_pos pos_max = -1) {
|
||||
if (pos_min == -1) {
|
||||
pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id);
|
||||
}
|
||||
if (pos_max == -1) {
|
||||
pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id);
|
||||
}
|
||||
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
auto cur = server_prompt_checkpoint {
|
||||
/*.pos_min = */ pos_min,
|
||||
/*.pos_max = */ pos_max,
|
||||
/*.n_tokens = */ n_tokens,
|
||||
/*.data = */ std::vector<uint8_t>(checkpoint_size),
|
||||
};
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != checkpoint_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
|
||||
enum slot_state {
|
||||
SLOT_STATE_IDLE,
|
||||
|
|
@ -51,13 +78,18 @@ enum server_state {
|
|||
struct server_slot {
|
||||
int id;
|
||||
|
||||
// TODO: change to unique_ptrs for consistency:
|
||||
llama_context * ctx = nullptr;
|
||||
|
||||
common_context_seq_rm_type ctx_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
|
||||
common_speculative * spec = nullptr;
|
||||
// speculative decoding
|
||||
llama_tokens spec_draft;
|
||||
std::vector<int32_t> spec_i_batch;
|
||||
server_prompt_checkpoint spec_ckpt;
|
||||
common_speculative_ptr spec;
|
||||
|
||||
// TODO: move members that belong to the task (such as `generated_text`, `has_new_line`) to task_results_state
|
||||
// see https://github.com/ggml-org/llama.cpp/pull/18283#issuecomment-3710175837
|
||||
|
|
@ -83,11 +115,6 @@ struct server_slot {
|
|||
std::string debug_generated_text;
|
||||
llama_tokens generated_tokens;
|
||||
|
||||
// idx of draft tokens in the main batch
|
||||
// non-empty if we went to evaluate draft tokens
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
|
||||
std::vector<int32_t> i_batch_dft;
|
||||
|
||||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
bool has_next_token = true;
|
||||
|
|
@ -147,8 +174,7 @@ struct server_slot {
|
|||
|
||||
common_sampler_ptr smpl;
|
||||
|
||||
llama_token sampled; // in speculative mode, this is the last accepted token
|
||||
llama_tokens drafted;
|
||||
llama_token sampled; // in speculative mode, this is the last accepted token
|
||||
|
||||
// stats
|
||||
size_t n_sent_text = 0; // number of sent text character
|
||||
|
|
@ -178,8 +204,11 @@ struct server_slot {
|
|||
stopping_word = "";
|
||||
n_sent_text = 0;
|
||||
|
||||
drafted.clear();
|
||||
i_batch_dft.clear();
|
||||
if (can_speculate()) {
|
||||
spec_draft.clear();
|
||||
spec_i_batch.clear();
|
||||
spec_ckpt.clear();
|
||||
}
|
||||
generated_tokens.clear();
|
||||
generated_token_probs.clear();
|
||||
json_schema = json();
|
||||
|
|
@ -300,6 +329,83 @@ struct server_slot {
|
|||
return n_draft_max;
|
||||
}
|
||||
|
||||
void update_batch(llama_batch & batch) {
|
||||
const int n_draft_max = get_n_draft_max();
|
||||
if (n_draft_max > 0) {
|
||||
GGML_ASSERT(can_speculate());
|
||||
|
||||
// generate draft tokens in speculative decoding mode
|
||||
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
|
||||
// perform the speculative drafting for all sequences at the same time in a single batch
|
||||
const llama_tokens & tokens = prompt.tokens.get_text_tokens();
|
||||
|
||||
const auto & params_spec = task->params.speculative;
|
||||
|
||||
if (!spec_draft.empty()) {
|
||||
// we have a previous (partial) draft to reuse
|
||||
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
|
||||
GGML_ASSERT(!spec_ckpt.empty());
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(spec_i_batch.empty());
|
||||
|
||||
// generate a new draft
|
||||
spec_draft = common_speculative_draft(spec.get(), params_spec, tokens, sampled);
|
||||
|
||||
if (spec_draft.size() > (size_t) n_draft_max) {
|
||||
SLT_WRN(*this, "draft size %d exceeds max %d, truncating\n", (int) spec_draft.size(), n_draft_max);
|
||||
spec_draft.resize(n_draft_max);
|
||||
}
|
||||
|
||||
if (spec_draft.size() < (size_t) params_spec.n_min) {
|
||||
SLT_DBG(*this, "ignoring small draft: %d < %d\n", (int) spec_draft.size(), params_spec.n_min);
|
||||
spec_draft.clear();
|
||||
}
|
||||
|
||||
if (!spec_draft.empty() && ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
|
||||
const auto n_tokens = prompt.tokens.size();
|
||||
|
||||
spec_ckpt = server_get_checkpoint(ctx, this->id, n_tokens);
|
||||
|
||||
SLT_DBG(*this, "created speculative checkpoint (pos_min = %d, pos_max = %d, n_tokens = %zu, size = %.3f MiB)\n",
|
||||
spec_ckpt.pos_min, spec_ckpt.pos_max, n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(spec_draft.size() <= (size_t) n_draft_max);
|
||||
}
|
||||
|
||||
if (spec_draft.empty()) {
|
||||
// no speculative decoding
|
||||
i_batch = batch.n_tokens;
|
||||
|
||||
common_batch_add(batch, sampled, prompt.tokens.pos_next(), { this->id }, true);
|
||||
|
||||
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
|
||||
sampled, n_ctx, prompt.n_tokens(), truncated);
|
||||
} else {
|
||||
SLT_DBG(*this, "generate_draft: id=%d, #tokens=%zu, #draft=%zu, pos_next=%d\n",
|
||||
sampled, prompt.tokens.size(), spec_draft.size(), prompt.tokens.pos_next());
|
||||
|
||||
GGML_ASSERT(spec_i_batch.empty());
|
||||
|
||||
spec_i_batch.push_back(batch.n_tokens);
|
||||
for (size_t i = 0; i < spec_draft.size(); i++) {
|
||||
spec_i_batch.push_back(batch.n_tokens + i + 1);
|
||||
}
|
||||
|
||||
auto pos0 = prompt.tokens.pos_next();
|
||||
|
||||
common_batch_add(batch, sampled, pos0++, { this->id }, true);
|
||||
for (auto token : spec_draft) {
|
||||
common_batch_add(batch, token, pos0++, { this->id }, true);
|
||||
}
|
||||
}
|
||||
|
||||
prompt.tokens.push_back(sampled);
|
||||
prompt.tokens.insert(spec_draft);
|
||||
}
|
||||
|
||||
void release() {
|
||||
if (is_processing()) {
|
||||
GGML_ASSERT(task);
|
||||
|
|
@ -400,7 +506,7 @@ struct server_slot {
|
|||
);
|
||||
}
|
||||
|
||||
common_speculative_print_stats(spec);
|
||||
common_speculative_print_stats(spec.get());
|
||||
}
|
||||
|
||||
json to_json(bool only_metrics = false) const {
|
||||
|
|
@ -591,16 +697,17 @@ private:
|
|||
|
||||
void destroy() {
|
||||
llama_init.reset();
|
||||
|
||||
ctx = nullptr;
|
||||
model = nullptr;
|
||||
|
||||
mtmd_free(mctx);
|
||||
mctx = nullptr;
|
||||
|
||||
// Clear any sampling context
|
||||
for (server_slot & slot : slots) {
|
||||
common_speculative_free(slot.spec);
|
||||
slot.spec = nullptr;
|
||||
if (slot.can_speculate()) {
|
||||
slot.spec.reset();
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
|
@ -642,9 +749,6 @@ private:
|
|||
|
||||
llama_init = common_init_from_params(params_base);
|
||||
|
||||
// propagate model-metadata sampling defaults back to caller
|
||||
params.sampling = params_base.sampling;
|
||||
|
||||
model = llama_init->model();
|
||||
ctx = llama_init->context();
|
||||
|
||||
|
|
@ -660,6 +764,7 @@ private:
|
|||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
|
||||
if (params_base.speculative.has_dft()) {
|
||||
// TODO speculative: move to common/speculative.cpp?
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());
|
||||
|
||||
const auto & params_spec = params_base.speculative;
|
||||
|
|
@ -727,11 +832,6 @@ private:
|
|||
params_base.n_cache_reuse = 0;
|
||||
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
|
||||
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
}
|
||||
|
||||
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
|
||||
|
|
@ -769,33 +869,38 @@ private:
|
|||
|
||||
slots.clear();
|
||||
|
||||
const bool can_spec = common_speculative_is_compat(ctx);
|
||||
if (!can_spec) {
|
||||
const auto ctx_seq_rm_type = common_context_can_seq_rm(ctx);
|
||||
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
|
||||
SRV_WRN("%s", "speculative decoding not supported by this context\n");
|
||||
}
|
||||
|
||||
if (ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
|
||||
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
|
||||
}
|
||||
|
||||
// initialize slots
|
||||
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||
server_slot slot;
|
||||
slots.emplace_back();
|
||||
}
|
||||
|
||||
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||
server_slot & slot = slots[i];
|
||||
|
||||
slot.id = i;
|
||||
slot.ctx = ctx;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
|
||||
slot.ctx_seq_rm_type = ctx_seq_rm_type;
|
||||
|
||||
slot.mctx = mctx;
|
||||
slot.prompt.tokens.has_mtmd = mctx != nullptr;
|
||||
|
||||
// try speculative decoding
|
||||
if (can_spec) {
|
||||
slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
|
||||
if (ctx_seq_rm_type != COMMON_CONTEXT_SEQ_RM_TYPE_NO) {
|
||||
slot.spec.reset(common_speculative_init(params_base.speculative, slot.ctx));
|
||||
|
||||
if (slot.spec) {
|
||||
if (mctx) {
|
||||
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
||||
return false;
|
||||
}
|
||||
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
|
||||
} else {
|
||||
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -806,8 +911,6 @@ private:
|
|||
};
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(std::move(slot));
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -854,6 +957,9 @@ private:
|
|||
model_aliases = params_base.model_alias;
|
||||
model_tags = params_base.model_tags;
|
||||
|
||||
// propagate new defaults back to caller
|
||||
params = params_base;
|
||||
|
||||
if (!is_resume) {
|
||||
return init();
|
||||
}
|
||||
|
|
@ -880,13 +986,13 @@ private:
|
|||
|
||||
metrics.init();
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
if (params_base.cache_idle_slots) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
|
||||
params_base.cache_idle_slots = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
|
||||
params_base.cache_idle_slots = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
|
||||
|
|
@ -1197,7 +1303,7 @@ private:
|
|||
backend_sampling &= task.params.sampling.backend_sampling;
|
||||
|
||||
// TODO: speculative decoding requires multiple samples per batch - not supported yet
|
||||
backend_sampling &= !(slot.spec && task.params.speculative.n_max > 0);
|
||||
backend_sampling &= !(slot.can_speculate() && task.params.speculative.n_max > 0);
|
||||
|
||||
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
|
||||
backend_sampling &= !need_logits;
|
||||
|
|
@ -1703,6 +1809,26 @@ private:
|
|||
return true;
|
||||
}
|
||||
|
||||
// n_tokens_cur: the number of tokens added to the batch for the current slot
|
||||
void create_checkpoint(server_slot & slot, const int64_t n_tokens_cur, llama_pos pos_min, llama_pos pos_max) {
|
||||
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
|
||||
// make room for the new checkpoint, if needed
|
||||
const auto & cur = slot.prompt.checkpoints.front();
|
||||
|
||||
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
|
||||
|
||||
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
|
||||
}
|
||||
|
||||
const auto & cur = slot.prompt.checkpoints.emplace_back(server_get_checkpoint(ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max));
|
||||
|
||||
SLT_WRN(slot,
|
||||
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
|
||||
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
|
||||
}
|
||||
|
||||
void process_single_task(server_task && task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
|
|
@ -1759,7 +1885,7 @@ private:
|
|||
break; // drop the task
|
||||
}
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
if (params_base.cache_idle_slots) {
|
||||
for (auto & s : slots) {
|
||||
if (!s.is_processing()) {
|
||||
slot_save_and_clear(s);
|
||||
|
|
@ -1854,7 +1980,7 @@ private:
|
|||
std::string filename = task.slot_action.filename;
|
||||
std::string filepath = task.slot_action.filepath;
|
||||
|
||||
const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
|
||||
const llama_tokens & tokens = slot->prompt.tokens.get_tokens();
|
||||
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
|
||||
|
||||
const int64_t t_end = ggml_time_us();
|
||||
|
|
@ -2061,7 +2187,7 @@ private:
|
|||
{
|
||||
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
|
||||
|
||||
llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy
|
||||
llama_tokens new_tokens = slot.prompt.tokens.get_tokens(); // copy
|
||||
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
|
||||
new_tokens[i - n_discard] = new_tokens[i];
|
||||
}
|
||||
|
|
@ -2100,61 +2226,7 @@ private:
|
|||
continue;
|
||||
}
|
||||
|
||||
// generate draft tokens in speculative decoding mode
|
||||
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
|
||||
// perform the speculative drafting for all sequences at the same time in a single batch
|
||||
const int n_draft_max = slot.get_n_draft_max();
|
||||
if (n_draft_max > 0) {
|
||||
if (mctx) {
|
||||
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
|
||||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
|
||||
|
||||
const auto & params_spec = slot.task->params.speculative;
|
||||
|
||||
llama_tokens draft = common_speculative_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
|
||||
|
||||
if (draft.size() > (size_t) n_draft_max) {
|
||||
SLT_WRN(slot, "draft size %d exceeds max %d, truncating\n", (int) draft.size(), n_draft_max);
|
||||
draft.resize(n_draft_max);
|
||||
}
|
||||
|
||||
// add the sampled token to the batch
|
||||
slot.i_batch_dft.push_back(batch.n_tokens);
|
||||
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
slot.prompt.tokens.push_back(slot.sampled);
|
||||
|
||||
if (slot.task->params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
|
||||
// fallback to normal decoding
|
||||
slot.i_batch = slot.i_batch_dft[0];
|
||||
slot.drafted.clear();
|
||||
slot.i_batch_dft.clear();
|
||||
} else {
|
||||
// keep track of total number of drafted tokens tested
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// add all drafted tokens to the batch
|
||||
for (size_t i = 0; i < draft.size(); i++) {
|
||||
slot.i_batch_dft.push_back(batch.n_tokens);
|
||||
common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
slot.prompt.tokens.push_back(draft[i]);
|
||||
}
|
||||
slot.drafted = std::move(draft);
|
||||
}
|
||||
} else {
|
||||
// no speculative decoding
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
|
||||
slot.prompt.tokens.push_back(slot.sampled);
|
||||
|
||||
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
|
||||
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
|
||||
}
|
||||
slot.update_batch(batch);
|
||||
}
|
||||
|
||||
// process in chunks of params.n_batch
|
||||
|
|
@ -2515,15 +2587,11 @@ private:
|
|||
|
||||
// make a checkpoint of the parts of the memory that cannot be rolled back.
|
||||
// checkpoints are created only if:
|
||||
// - the model does not support partial sequence removal
|
||||
// - the model uses SWA and we are not using `swa_full`
|
||||
// - the model architecture is marked as recurrent or hybrid
|
||||
//
|
||||
// TODO: try to make this conditional on the context or the memory module, instead of the model type
|
||||
do_checkpoint = do_checkpoint && (
|
||||
llama_model_is_recurrent(model) ||
|
||||
llama_model_is_hybrid(model) ||
|
||||
(llama_model_n_swa(model) > 0 && !params_base.swa_full)
|
||||
);
|
||||
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
|
||||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
|
||||
|
||||
bool has_mtmd = false;
|
||||
|
||||
|
|
@ -2651,40 +2719,12 @@ private:
|
|||
|
||||
// no need to create checkpoints that are too close together
|
||||
do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
|
||||
SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
|
||||
|
||||
// note: we create the checkpoint before calling llama_decode(), so the current batch is not
|
||||
// yet processed and therefore it is not part of the checkpoint.
|
||||
if (do_checkpoint) {
|
||||
while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
|
||||
// make room for the new checkpoint, if needed
|
||||
const auto & cur = slot.prompt.checkpoints.front();
|
||||
|
||||
SLT_WRN(slot,
|
||||
"erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
|
||||
", size = %.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
|
||||
|
||||
slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin());
|
||||
}
|
||||
|
||||
const size_t checkpoint_size =
|
||||
llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
|
||||
/*.pos_min = */ pos_min,
|
||||
/*.pos_max = */ pos_max,
|
||||
/*.n_tokens = */ slot.prompt.n_tokens() - n_tokens_cur,
|
||||
/*.data = */ std::vector<uint8_t>(checkpoint_size),
|
||||
});
|
||||
|
||||
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id,
|
||||
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
SLT_WRN(slot,
|
||||
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64
|
||||
", size = %.3f MiB)\n",
|
||||
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
|
||||
cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024);
|
||||
create_checkpoint(slot, n_tokens_cur, pos_min, pos_max);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2856,19 +2896,19 @@ private:
|
|||
slot.state = SLOT_STATE_GENERATING;
|
||||
|
||||
if (slot.can_speculate()) {
|
||||
common_speculative_begin(slot.spec, slot.prompt.tokens.get_text_tokens());
|
||||
common_speculative_begin(slot.spec.get(), slot.prompt.tokens.get_text_tokens());
|
||||
}
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
if (slot.i_batch_dft.size() > 0) {
|
||||
if (slot.can_speculate() && !slot.spec_draft.empty()) {
|
||||
continue; // sample using speculative decoding
|
||||
}
|
||||
|
||||
const int tok_idx = slot.i_batch - i;
|
||||
|
||||
llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx);
|
||||
llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx, tok_idx);
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
||||
|
|
@ -2889,7 +2929,7 @@ private:
|
|||
|
||||
completion_token_output result;
|
||||
result.tok = id;
|
||||
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
|
||||
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
|
||||
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
|
||||
|
||||
if (slot.task->params.sampling.n_probs > 0) {
|
||||
|
|
@ -2909,43 +2949,85 @@ private:
|
|||
|
||||
// speculative decoding - main model sample and accept
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
|
||||
if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t n_draft = slot.drafted.size();
|
||||
// save the original draft size
|
||||
const size_t n_draft = slot.spec_draft.size();
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
|
||||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
GGML_ASSERT(n_draft > 0);
|
||||
|
||||
// verify and try to accept the draft
|
||||
{
|
||||
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
|
||||
|
||||
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
|
||||
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
|
||||
slot.spec_i_batch.clear();
|
||||
|
||||
SLT_DBG(slot, "%s: n_draft=%zu, accepted=%zu\n", __func__, slot.spec_draft.size(), accepted.size());
|
||||
|
||||
GGML_ASSERT(accepted.size() >= 1);
|
||||
|
||||
// check for partial draft acceptance
|
||||
if (accepted.size() < slot.spec_draft.size() + 1) {
|
||||
if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
|
||||
// partial acceptance is not supported by the context -> truncate the draft and restore the state
|
||||
slot.spec_draft = std::move(accepted);
|
||||
|
||||
const auto & ckpt = slot.spec_ckpt;
|
||||
|
||||
SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n",
|
||||
ckpt.pos_min, ckpt.pos_max, ckpt.size());
|
||||
|
||||
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != ckpt.size()) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);
|
||||
}
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, ckpt.pos_max + 1, -1);
|
||||
|
||||
slot.prompt.tokens.keep_first(ckpt.n_tokens);
|
||||
slot.smpl = std::move(smpl_save);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, accepted.size(), slot.spec_draft.size());
|
||||
}
|
||||
|
||||
common_speculative_accept(slot.spec.get(), accepted.size() - 1);
|
||||
|
||||
slot.spec_draft = std::move(accepted);
|
||||
}
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
slot.n_decoded += ids.size();
|
||||
const auto ids = std::move(slot.spec_draft);
|
||||
|
||||
slot.n_decoded += ids.size();
|
||||
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
|
||||
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
// inform the speculative decoding about the number of accepted tokens
|
||||
common_speculative_accept(slot.spec, ids.size() - 1);
|
||||
|
||||
// rollback to the state before sampling the draft tokens
|
||||
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
|
||||
slot.n_draft_total += n_draft;
|
||||
|
||||
// add accepted tokens to the prompt
|
||||
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
|
||||
slot.prompt.tokens.insert({ids.begin(), ids.end() - 1});
|
||||
slot.sampled = ids.back(); // last accepted token
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1);
|
||||
slot.sampled = ids.back(); // last accepted token
|
||||
SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft);
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(slot.ctx), slot.id, slot.prompt.n_tokens(), -1);
|
||||
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
completion_token_output result;
|
||||
|
||||
result.tok = ids[i];
|
||||
result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
|
||||
result.text_to_send = common_token_to_piece(slot.ctx, result.tok, accept_special_token(slot, result.tok));
|
||||
result.prob = 1.0f; // set later
|
||||
|
||||
// TODO: set result.probs
|
||||
|
|
@ -3537,6 +3619,7 @@ void server_routes::init_routes() {
|
|||
{"vision", meta->has_inp_image},
|
||||
{"audio", meta->has_inp_audio},
|
||||
} },
|
||||
{ "media_marker", get_media_marker() },
|
||||
{ "endpoint_slots", params.endpoint_slots },
|
||||
{ "endpoint_props", params.endpoint_props },
|
||||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
|
|
@ -3664,7 +3747,7 @@ void server_routes::init_routes() {
|
|||
params.n_predict,
|
||||
meta->slot_n_ctx,
|
||||
params.spm_infill,
|
||||
tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
|
||||
tokenized_prompts[0].get_tokens() // TODO: this could maybe be multimodal.
|
||||
);
|
||||
|
||||
std::vector<raw_buffer> files; // dummy
|
||||
|
|
|
|||
|
|
@ -162,7 +162,7 @@ common_chat_msg task_result_state::update_chat_msg(
|
|||
bool filter_tool_calls) {
|
||||
generated_text += text_added;
|
||||
auto msg_prv_copy = chat_msg;
|
||||
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
|
||||
//SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
|
||||
auto new_msg = common_chat_parse(
|
||||
generated_text,
|
||||
is_partial,
|
||||
|
|
@ -304,6 +304,8 @@ task_params server_task::params_from_json_cmpl(
|
|||
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
|
||||
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
|
||||
|
||||
params.speculative = defaults.speculative;
|
||||
|
||||
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
|
||||
params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
|
||||
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
|
||||
|
|
|
|||
|
|
@ -576,6 +576,17 @@ struct server_prompt_checkpoint {
|
|||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return data.empty();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
pos_min = 0;
|
||||
pos_max = 0;
|
||||
n_tokens = 0;
|
||||
data.clear();
|
||||
}
|
||||
};
|
||||
|
||||
struct server_prompt {
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ def test_clear_and_restore():
|
|||
|
||||
def test_disabled_with_flag():
|
||||
global server
|
||||
server.no_clear_idle = True
|
||||
server.no_cache_idle_slots = True
|
||||
server.start()
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@ class ServerProcess:
|
|||
media_path: str | None = None
|
||||
sleep_idle_seconds: int | None = None
|
||||
cache_ram: int | None = None
|
||||
no_clear_idle: bool = False
|
||||
no_cache_idle_slots: bool = False
|
||||
log_path: str | None = None
|
||||
webui_mcp_proxy: bool = False
|
||||
|
||||
|
|
@ -242,8 +242,8 @@ class ServerProcess:
|
|||
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
|
||||
if self.cache_ram is not None:
|
||||
server_args.extend(["--cache-ram", self.cache_ram])
|
||||
if self.no_clear_idle:
|
||||
server_args.append("--no-clear-idle")
|
||||
if self.no_cache_idle_slots:
|
||||
server_args.append("--no-cache-idle-slots")
|
||||
if self.webui_mcp_proxy:
|
||||
server_args.append("--webui-mcp-proxy")
|
||||
|
||||
|
|
|
|||
98
vendor/cpp-httplib/httplib.cpp
vendored
98
vendor/cpp-httplib/httplib.cpp
vendored
|
|
@ -1,7 +1,5 @@
|
|||
#include "httplib.h"
|
||||
namespace httplib {
|
||||
// httplib::any — type-erased value container (C++11 compatible)
|
||||
// On C++17+ builds, thin wrappers around std::any are provided.
|
||||
|
||||
/*
|
||||
* Implementation that will be part of the .cc file if split into .h + .cc.
|
||||
|
|
@ -1877,7 +1875,7 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
|
|||
}
|
||||
|
||||
return ret;
|
||||
#elif TARGET_OS_MAC
|
||||
#elif TARGET_OS_MAC && defined(__clang__)
|
||||
if (!node) { return EAI_NONAME; }
|
||||
// macOS implementation using CFHost API for asynchronous DNS resolution
|
||||
CFStringRef hostname_ref = CFStringCreateWithCString(
|
||||
|
|
@ -5836,6 +5834,17 @@ std::string Request::get_param_value(const std::string &key,
|
|||
return std::string();
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
Request::get_param_values(const std::string &key) const {
|
||||
auto rng = params.equal_range(key);
|
||||
std::vector<std::string> values;
|
||||
values.reserve(static_cast<size_t>(std::distance(rng.first, rng.second)));
|
||||
for (auto it = rng.first; it != rng.second; ++it) {
|
||||
values.push_back(it->second);
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
size_t Request::get_param_value_count(const std::string &key) const {
|
||||
auto r = params.equal_range(key);
|
||||
return static_cast<size_t>(std::distance(r.first, r.second));
|
||||
|
|
@ -7013,6 +7022,15 @@ Server &Server::set_keep_alive_timeout(time_t sec) {
|
|||
return *this;
|
||||
}
|
||||
|
||||
template <class Rep, class Period>
|
||||
Server &Server::set_keep_alive_timeout(
|
||||
const std::chrono::duration<Rep, Period> &duration) {
|
||||
detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t /*usec*/) {
|
||||
set_keep_alive_timeout(sec);
|
||||
});
|
||||
return *this;
|
||||
}
|
||||
|
||||
Server &Server::set_read_timeout(time_t sec, time_t usec) {
|
||||
read_timeout_sec_ = sec;
|
||||
read_timeout_usec_ = usec;
|
||||
|
|
@ -9119,20 +9137,21 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
|
|||
auto location = res.get_header_value("location");
|
||||
if (location.empty()) { return false; }
|
||||
|
||||
thread_local const std::regex re(
|
||||
R"((?:(https?):)?(?://(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)");
|
||||
detail::UrlComponents uc;
|
||||
if (!detail::parse_url(location, uc)) { return false; }
|
||||
|
||||
std::smatch m;
|
||||
if (!std::regex_match(location, m, re)) { return false; }
|
||||
// Only follow http/https redirects
|
||||
if (!uc.scheme.empty() && uc.scheme != "http" && uc.scheme != "https") {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto scheme = is_ssl() ? "https" : "http";
|
||||
|
||||
auto next_scheme = m[1].str();
|
||||
auto next_host = m[2].str();
|
||||
if (next_host.empty()) { next_host = m[3].str(); }
|
||||
auto port_str = m[4].str();
|
||||
auto next_path = m[5].str();
|
||||
auto next_query = m[6].str();
|
||||
auto next_scheme = std::move(uc.scheme);
|
||||
auto next_host = std::move(uc.host);
|
||||
auto port_str = std::move(uc.port);
|
||||
auto next_path = std::move(uc.path);
|
||||
auto next_query = std::move(uc.query);
|
||||
|
||||
auto next_port = port_;
|
||||
if (!port_str.empty()) {
|
||||
|
|
@ -9145,7 +9164,7 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
|
|||
if (next_host.empty()) { next_host = host_; }
|
||||
if (next_path.empty()) { next_path = "/"; }
|
||||
|
||||
auto path = decode_query_component(next_path, true) + next_query;
|
||||
auto path = decode_path_component(next_path) + next_query;
|
||||
|
||||
// Same host redirect - use current client
|
||||
if (next_scheme == scheme && next_host == host_ && next_port == port_) {
|
||||
|
|
@ -10869,12 +10888,9 @@ Client::Client(const std::string &scheme_host_port)
|
|||
Client::Client(const std::string &scheme_host_port,
|
||||
const std::string &client_cert_path,
|
||||
const std::string &client_key_path) {
|
||||
const static std::regex re(
|
||||
R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)");
|
||||
|
||||
std::smatch m;
|
||||
if (std::regex_match(scheme_host_port, m, re)) {
|
||||
auto scheme = m[1].str();
|
||||
detail::UrlComponents uc;
|
||||
if (detail::parse_url(scheme_host_port, uc) && !uc.host.empty()) {
|
||||
auto &scheme = uc.scheme;
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
if (!scheme.empty() && (scheme != "http" && scheme != "https")) {
|
||||
|
|
@ -10890,12 +10906,10 @@ Client::Client(const std::string &scheme_host_port,
|
|||
|
||||
auto is_ssl = scheme == "https";
|
||||
|
||||
auto host = m[2].str();
|
||||
if (host.empty()) { host = m[3].str(); }
|
||||
auto host = std::move(uc.host);
|
||||
|
||||
auto port_str = m[4].str();
|
||||
auto port = is_ssl ? 443 : 80;
|
||||
if (!port_str.empty() && !detail::parse_port(port_str, port)) { return; }
|
||||
if (!uc.port.empty() && !detail::parse_port(uc.port, port)) { return; }
|
||||
|
||||
if (is_ssl) {
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
|
|
@ -12466,6 +12480,18 @@ std::string Request::sni() const {
|
|||
*/
|
||||
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
// These wrappers forward to deprecated APIs that will be removed by v1.0.0.
|
||||
// Suppress C4996 / -Wdeprecated-declarations so that MSVC /sdl builds (which
|
||||
// promote C4996 to an error) compile cleanly even though the wrappers
|
||||
// themselves are also marked [[deprecated]].
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4996)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
SSL_CTX *Client::ssl_context() const {
|
||||
if (is_ssl_) { return static_cast<SSLClient &>(*cli_).ssl_context(); }
|
||||
return nullptr;
|
||||
|
|
@ -12480,6 +12506,12 @@ long Client::get_verify_result() const {
|
|||
if (is_ssl_) { return static_cast<SSLClient &>(*cli_).get_verify_result(); }
|
||||
return -1; // NOTE: -1 doesn't match any of X509_V_ERR_???
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
#endif // CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
|
||||
/*
|
||||
|
|
@ -16302,12 +16334,10 @@ bool WebSocket::is_open() const { return !closed_; }
|
|||
WebSocketClient::WebSocketClient(
|
||||
const std::string &scheme_host_port_path, const Headers &headers)
|
||||
: headers_(headers) {
|
||||
const static std::regex re(
|
||||
R"(([a-z]+):\/\/(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?(\/.*))");
|
||||
|
||||
std::smatch m;
|
||||
if (std::regex_match(scheme_host_port_path, m, re)) {
|
||||
auto scheme = m[1].str();
|
||||
detail::UrlComponents uc;
|
||||
if (detail::parse_url(scheme_host_port_path, uc) && !uc.scheme.empty() &&
|
||||
!uc.host.empty() && !uc.path.empty()) {
|
||||
auto &scheme = uc.scheme;
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
if (scheme != "ws" && scheme != "wss") {
|
||||
|
|
@ -16323,14 +16353,12 @@ WebSocketClient::WebSocketClient(
|
|||
|
||||
auto is_ssl = scheme == "wss";
|
||||
|
||||
host_ = m[2].str();
|
||||
if (host_.empty()) { host_ = m[3].str(); }
|
||||
host_ = std::move(uc.host);
|
||||
|
||||
auto port_str = m[4].str();
|
||||
port_ = is_ssl ? 443 : 80;
|
||||
if (!port_str.empty() && !detail::parse_port(port_str, port_)) { return; }
|
||||
if (!uc.port.empty() && !detail::parse_port(uc.port, port_)) { return; }
|
||||
|
||||
path_ = m[5].str();
|
||||
path_ = std::move(uc.path);
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
is_ssl_ = is_ssl;
|
||||
|
|
|
|||
274
vendor/cpp-httplib/httplib.h
vendored
274
vendor/cpp-httplib/httplib.h
vendored
|
|
@ -8,8 +8,8 @@
|
|||
#ifndef CPPHTTPLIB_HTTPLIB_H
|
||||
#define CPPHTTPLIB_HTTPLIB_H
|
||||
|
||||
#define CPPHTTPLIB_VERSION "0.40.0"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002800"
|
||||
#define CPPHTTPLIB_VERSION "0.42.0"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002a00"
|
||||
|
||||
#ifdef _WIN32
|
||||
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
|
||||
|
|
@ -333,13 +333,10 @@ using socket_t = int;
|
|||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#if __cplusplus >= 201703L
|
||||
#include <any>
|
||||
#endif
|
||||
|
||||
// On macOS with a TLS backend, enable Keychain root certificates by default
|
||||
// unless the user explicitly opts out.
|
||||
#if defined(__APPLE__) && \
|
||||
#if defined(__APPLE__) && defined(__clang__) && \
|
||||
!defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) && \
|
||||
(defined(CPPHTTPLIB_OPENSSL_SUPPORT) || \
|
||||
defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || \
|
||||
|
|
@ -358,7 +355,7 @@ using socket_t = int;
|
|||
|
||||
#if defined(CPPHTTPLIB_USE_NON_BLOCKING_GETADDRINFO) || \
|
||||
defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
|
||||
#if TARGET_OS_MAC
|
||||
#if TARGET_OS_MAC && defined(__clang__)
|
||||
#include <CFNetwork/CFHost.h>
|
||||
#include <CoreFoundation/CoreFoundation.h>
|
||||
#endif
|
||||
|
|
@ -701,9 +698,96 @@ inline bool parse_port(const std::string &s, int &port) {
|
|||
return parse_port(s.data(), s.size(), port);
|
||||
}
|
||||
|
||||
struct UrlComponents {
|
||||
std::string scheme;
|
||||
std::string host;
|
||||
std::string port;
|
||||
std::string path;
|
||||
std::string query;
|
||||
};
|
||||
|
||||
inline bool parse_url(const std::string &url, UrlComponents &uc) {
|
||||
uc = {};
|
||||
size_t pos = 0;
|
||||
|
||||
auto sep = url.find("://");
|
||||
if (sep != std::string::npos) {
|
||||
uc.scheme = url.substr(0, sep);
|
||||
|
||||
// Scheme must be [a-z]+ only
|
||||
if (uc.scheme.empty()) { return false; }
|
||||
for (auto c : uc.scheme) {
|
||||
if (c < 'a' || c > 'z') { return false; }
|
||||
}
|
||||
|
||||
pos = sep + 3;
|
||||
} else if (url.compare(0, 2, "//") == 0) {
|
||||
pos = 2;
|
||||
}
|
||||
|
||||
auto has_authority_prefix = pos > 0;
|
||||
auto has_authority = has_authority_prefix || (!url.empty() && url[0] != '/' &&
|
||||
url[0] != '?' && url[0] != '#');
|
||||
if (has_authority) {
|
||||
if (pos < url.size() && url[pos] == '[') {
|
||||
auto close = url.find(']', pos);
|
||||
if (close == std::string::npos) { return false; }
|
||||
uc.host = url.substr(pos + 1, close - pos - 1);
|
||||
|
||||
// IPv6 host must be [a-fA-F0-9:]+ only
|
||||
if (uc.host.empty()) { return false; }
|
||||
for (auto c : uc.host) {
|
||||
if (!((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ||
|
||||
(c >= '0' && c <= '9') || c == ':')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos = close + 1;
|
||||
} else {
|
||||
auto end = url.find_first_of(":/?#", pos);
|
||||
if (end == std::string::npos) { end = url.size(); }
|
||||
uc.host = url.substr(pos, end - pos);
|
||||
pos = end;
|
||||
}
|
||||
|
||||
if (pos < url.size() && url[pos] == ':') {
|
||||
++pos;
|
||||
auto end = url.find_first_of("/?#", pos);
|
||||
if (end == std::string::npos) { end = url.size(); }
|
||||
uc.port = url.substr(pos, end - pos);
|
||||
pos = end;
|
||||
}
|
||||
|
||||
// Without :// or //, the entire input must be consumed as host[:port].
|
||||
// If there is leftover (path, query, etc.), this is not a valid
|
||||
// host[:port] string — clear and reparse as a plain path.
|
||||
if (!has_authority_prefix && pos < url.size()) {
|
||||
uc.host.clear();
|
||||
uc.port.clear();
|
||||
pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos < url.size() && url[pos] != '?' && url[pos] != '#') {
|
||||
auto end = url.find_first_of("?#", pos);
|
||||
if (end == std::string::npos) { end = url.size(); }
|
||||
uc.path = url.substr(pos, end - pos);
|
||||
pos = end;
|
||||
}
|
||||
|
||||
if (pos < url.size() && url[pos] == '?') {
|
||||
auto end = url.find('#', pos);
|
||||
if (end == std::string::npos) { end = url.size(); }
|
||||
uc.query = url.substr(pos, end - pos);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
enum SSLVerifierResponse {
|
||||
enum class SSLVerifierResponse {
|
||||
// no decision has been made, use the built-in certificate verifier
|
||||
NoDecisionMade,
|
||||
// connection certificate is verified and accepted
|
||||
|
|
@ -797,38 +881,15 @@ using Match = std::smatch;
|
|||
using DownloadProgress = std::function<bool(size_t current, size_t total)>;
|
||||
using UploadProgress = std::function<bool(size_t current, size_t total)>;
|
||||
|
||||
|
||||
#if __cplusplus >= 201703L
|
||||
|
||||
using any = std::any;
|
||||
using bad_any_cast = std::bad_any_cast;
|
||||
|
||||
template <typename T> T any_cast(const any &a) { return std::any_cast<T>(a); }
|
||||
template <typename T> T any_cast(any &a) { return std::any_cast<T>(a); }
|
||||
template <typename T> T any_cast(any &&a) {
|
||||
return std::any_cast<T>(std::move(a));
|
||||
}
|
||||
template <typename T> const T *any_cast(const any *a) noexcept {
|
||||
return std::any_cast<T>(a);
|
||||
}
|
||||
template <typename T> T *any_cast(any *a) noexcept {
|
||||
return std::any_cast<T>(a);
|
||||
}
|
||||
|
||||
#else // C++11/14 implementation
|
||||
|
||||
class bad_any_cast : public std::bad_cast {
|
||||
public:
|
||||
const char *what() const noexcept override { return "bad any_cast"; }
|
||||
};
|
||||
|
||||
/*
|
||||
* detail: type-erased storage used by UserData.
|
||||
* ABI-stable regardless of C++ standard — always uses this custom
|
||||
* implementation instead of std::any.
|
||||
*/
|
||||
namespace detail {
|
||||
|
||||
using any_type_id = const void *;
|
||||
|
||||
// Returns a unique per-type ID without RTTI.
|
||||
// The static address is stable across TUs because function templates are
|
||||
// implicitly inline and the ODR merges their statics into one.
|
||||
template <typename T> any_type_id any_typeid() noexcept {
|
||||
static const char id = 0;
|
||||
return &id;
|
||||
|
|
@ -851,89 +912,60 @@ template <typename T> struct any_value final : any_storage {
|
|||
|
||||
} // namespace detail
|
||||
|
||||
class any {
|
||||
std::unique_ptr<detail::any_storage> storage_;
|
||||
|
||||
class UserData {
|
||||
public:
|
||||
any() noexcept = default;
|
||||
any(const any &o) : storage_(o.storage_ ? o.storage_->clone() : nullptr) {}
|
||||
any(any &&) noexcept = default;
|
||||
any &operator=(const any &o) {
|
||||
storage_ = o.storage_ ? o.storage_->clone() : nullptr;
|
||||
return *this;
|
||||
UserData() = default;
|
||||
UserData(UserData &&) noexcept = default;
|
||||
UserData &operator=(UserData &&) noexcept = default;
|
||||
|
||||
UserData(const UserData &o) {
|
||||
for (const auto &e : o.entries_) {
|
||||
if (e.second) { entries_[e.first] = e.second->clone(); }
|
||||
}
|
||||
}
|
||||
any &operator=(any &&) noexcept = default;
|
||||
|
||||
template <
|
||||
typename T, typename D = typename std::decay<T>::type,
|
||||
typename std::enable_if<!std::is_same<D, any>::value, int>::type = 0>
|
||||
any(T &&v) : storage_(new detail::any_value<D>(std::forward<T>(v))) {}
|
||||
|
||||
template <
|
||||
typename T, typename D = typename std::decay<T>::type,
|
||||
typename std::enable_if<!std::is_same<D, any>::value, int>::type = 0>
|
||||
any &operator=(T &&v) {
|
||||
storage_.reset(new detail::any_value<D>(std::forward<T>(v)));
|
||||
UserData &operator=(const UserData &o) {
|
||||
if (this != &o) {
|
||||
entries_.clear();
|
||||
for (const auto &e : o.entries_) {
|
||||
if (e.second) { entries_[e.first] = e.second->clone(); }
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool has_value() const noexcept { return storage_ != nullptr; }
|
||||
void reset() noexcept { storage_.reset(); }
|
||||
template <typename T> void set(const std::string &key, T &&value) {
|
||||
using D = typename std::decay<T>::type;
|
||||
entries_[key].reset(new detail::any_value<D>(std::forward<T>(value)));
|
||||
}
|
||||
|
||||
template <typename T> friend T *any_cast(any *a) noexcept;
|
||||
template <typename T> friend const T *any_cast(const any *a) noexcept;
|
||||
template <typename T> T *get(const std::string &key) noexcept {
|
||||
auto it = entries_.find(key);
|
||||
if (it == entries_.end() || !it->second) { return nullptr; }
|
||||
if (it->second->type_id() != detail::any_typeid<T>()) { return nullptr; }
|
||||
return &static_cast<detail::any_value<T> *>(it->second.get())->value;
|
||||
}
|
||||
|
||||
template <typename T> const T *get(const std::string &key) const noexcept {
|
||||
auto it = entries_.find(key);
|
||||
if (it == entries_.end() || !it->second) { return nullptr; }
|
||||
if (it->second->type_id() != detail::any_typeid<T>()) { return nullptr; }
|
||||
return &static_cast<const detail::any_value<T> *>(it->second.get())->value;
|
||||
}
|
||||
|
||||
bool has(const std::string &key) const noexcept {
|
||||
return entries_.find(key) != entries_.end();
|
||||
}
|
||||
|
||||
void erase(const std::string &key) { entries_.erase(key); }
|
||||
|
||||
void clear() noexcept { entries_.clear(); }
|
||||
|
||||
private:
|
||||
std::unordered_map<std::string, std::unique_ptr<detail::any_storage>>
|
||||
entries_;
|
||||
};
|
||||
|
||||
template <typename T> T *any_cast(any *a) noexcept {
|
||||
if (!a || !a->storage_) { return nullptr; }
|
||||
if (a->storage_->type_id() != detail::any_typeid<T>()) { return nullptr; }
|
||||
return &static_cast<detail::any_value<T> *>(a->storage_.get())->value;
|
||||
}
|
||||
|
||||
template <typename T> const T *any_cast(const any *a) noexcept {
|
||||
if (!a || !a->storage_) { return nullptr; }
|
||||
if (a->storage_->type_id() != detail::any_typeid<T>()) { return nullptr; }
|
||||
return &static_cast<const detail::any_value<T> *>(a->storage_.get())->value;
|
||||
}
|
||||
|
||||
template <typename T> T any_cast(const any &a) {
|
||||
using U =
|
||||
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
|
||||
const U *p = any_cast<U>(&a);
|
||||
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
|
||||
if (!p) { throw bad_any_cast{}; }
|
||||
#else
|
||||
if (!p) { std::abort(); }
|
||||
#endif
|
||||
return static_cast<T>(*p);
|
||||
}
|
||||
|
||||
template <typename T> T any_cast(any &a) {
|
||||
using U =
|
||||
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
|
||||
U *p = any_cast<U>(&a);
|
||||
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
|
||||
if (!p) { throw bad_any_cast{}; }
|
||||
#else
|
||||
if (!p) { std::abort(); }
|
||||
#endif
|
||||
return static_cast<T>(*p);
|
||||
}
|
||||
|
||||
template <typename T> T any_cast(any &&a) {
|
||||
using U =
|
||||
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
|
||||
U *p = any_cast<U>(&a);
|
||||
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
|
||||
if (!p) { throw bad_any_cast{}; }
|
||||
#else
|
||||
if (!p) { std::abort(); }
|
||||
#endif
|
||||
return static_cast<T>(std::move(*p));
|
||||
}
|
||||
|
||||
#endif // __cplusplus >= 201703L
|
||||
|
||||
struct Response;
|
||||
using ResponseHandler = std::function<bool(const Response &response)>;
|
||||
|
||||
|
|
@ -1261,6 +1293,7 @@ struct Request {
|
|||
|
||||
bool has_param(const std::string &key) const;
|
||||
std::string get_param_value(const std::string &key, size_t id = 0) const;
|
||||
std::vector<std::string> get_param_values(const std::string &key) const;
|
||||
size_t get_param_value_count(const std::string &key) const;
|
||||
|
||||
bool is_multipart_form_data() const;
|
||||
|
|
@ -1293,7 +1326,7 @@ struct Response {
|
|||
|
||||
// User-defined context — set by pre-routing/pre-request handlers and read
|
||||
// by route handlers to pass arbitrary data (e.g. decoded auth tokens).
|
||||
std::map<std::string, any> user_data;
|
||||
UserData user_data;
|
||||
|
||||
bool has_header(const std::string &key) const;
|
||||
std::string get_header_value(const std::string &key, const char *def = "",
|
||||
|
|
@ -1664,6 +1697,9 @@ public:
|
|||
|
||||
Server &set_keep_alive_max_count(size_t count);
|
||||
Server &set_keep_alive_timeout(time_t sec);
|
||||
template <class Rep, class Period>
|
||||
Server &
|
||||
set_keep_alive_timeout(const std::chrono::duration<Rep, Period> &duration);
|
||||
|
||||
Server &set_read_timeout(time_t sec, time_t usec = 0);
|
||||
template <class Rep, class Period>
|
||||
|
|
@ -2790,10 +2826,26 @@ public:
|
|||
"This function will be removed by v1.0.0.")]]
|
||||
SSL_CTX *ssl_context() const;
|
||||
|
||||
// Override of a deprecated virtual in ClientImpl. Suppress C4996 /
|
||||
// -Wdeprecated-declarations on the override declaration itself so that
|
||||
// MSVC /sdl builds compile cleanly. Will be removed together with the
|
||||
// base virtual by v1.0.0.
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4996)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
[[deprecated("Use set_session_verifier(session_t) instead. "
|
||||
"This function will be removed by v1.0.0.")]]
|
||||
void set_server_certificate_verifier(
|
||||
std::function<SSLVerifierResponse(SSL *ssl)> verifier) override;
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
private:
|
||||
bool verify_host(X509 *server_cert) const;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue