mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-09 19:46:11 +00:00
Merge commit '128d522c04' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # .github/workflows/release.yml # ggml/src/ggml-vulkan/ggml-vulkan.cpp # tests/test-alloc.cpp # tests/test-chat.cpp
This commit is contained in:
commit
1d728bbc89
18 changed files with 381 additions and 226 deletions
|
|
@ -1955,13 +1955,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||
add_opt(common_arg(
|
||||
{"--swa-checkpoints"}, "N",
|
||||
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
|
||||
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
|
||||
string_format("max number of context checkpoints to create per slot (default: %d)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
|
||||
[](common_params & params, int value) {
|
||||
params.n_swa_checkpoints = value;
|
||||
params.n_ctx_checkpoints = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--kv-unified", "-kvu"},
|
||||
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
||||
|
|
|
|||
|
|
@ -625,6 +625,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
|||
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
|
||||
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
|
||||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
|
||||
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
|
||||
|
|
@ -984,6 +985,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
|
|||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
|
||||
data.preserved_tokens = {
|
||||
"[THINK]",
|
||||
"[/THINK]",
|
||||
};
|
||||
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
auto schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
schemas.push_back({
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"name", {
|
||||
{"type", "string"},
|
||||
{"const", function.at("name")},
|
||||
}},
|
||||
{"arguments", function.at("parameters")},
|
||||
{"id", {
|
||||
{"type", "string"},
|
||||
{"pattern", "^[a-zA-Z0-9]{9}$"},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments", "id"})},
|
||||
});
|
||||
});
|
||||
auto schema = json {
|
||||
{"type", "array"},
|
||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
||||
{"minItems", 1},
|
||||
};
|
||||
if (!inputs.parallel_tool_calls) {
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
||||
});
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
|
||||
data.preserved_tokens.push_back("[TOOL_CALLS]");
|
||||
} else {
|
||||
data.grammar_lazy = false;
|
||||
if (!inputs.json_schema.is_null()) {
|
||||
if (!inputs.grammar.empty()) {
|
||||
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
|
||||
}
|
||||
data.grammar = json_schema_to_grammar(inputs.json_schema);
|
||||
} else {
|
||||
data.grammar = inputs.grammar;
|
||||
}
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
|
|
@ -994,6 +1054,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
|
|||
parse_prefixed_json_tool_call_array(builder, prefix);
|
||||
}
|
||||
|
||||
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
|
||||
builder.try_parse_reasoning("[THINK]", "[/THINK]");
|
||||
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
return;
|
||||
}
|
||||
|
||||
static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
|
||||
parse_prefixed_json_tool_call_array(builder, prefix);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
|
|
@ -2702,6 +2774,10 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|||
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
|
||||
return common_chat_params_init_magistral(tmpl, params);
|
||||
}
|
||||
|
||||
// Plain handler (no tools)
|
||||
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
|
|
@ -2802,6 +2878,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
|
||||
common_chat_parse_mistral_nemo(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_MAGISTRAL:
|
||||
common_chat_parse_magistral(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X:
|
||||
common_chat_parse_llama_3_1(builder);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ enum common_chat_format {
|
|||
COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
COMMON_CHAT_FORMAT_GENERIC,
|
||||
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
|
||||
COMMON_CHAT_FORMAT_MAGISTRAL,
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X,
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
|
||||
|
|
|
|||
|
|
@ -420,7 +420,7 @@ struct common_params {
|
|||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
|
||||
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
|
|
|
|||
|
|
@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
|
|||
free(alloc);
|
||||
}
|
||||
|
||||
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
||||
size_t max_size = 0;
|
||||
for (int i = 0; i < alloc->n_chunks; i++) {
|
||||
max_size += alloc->chunks[i]->max_size;
|
||||
}
|
||||
return max_size;
|
||||
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
|
||||
return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
|
|||
free(buf);
|
||||
}
|
||||
|
||||
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
|
||||
int n = 0;
|
||||
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
|
||||
return n;
|
||||
static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
|
||||
return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
|
||||
}
|
||||
|
||||
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
|
||||
|
|
@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
}
|
||||
}
|
||||
|
||||
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
||||
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
||||
|
||||
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
||||
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
||||
bool realloc = galloc->buffers[i] == NULL;
|
||||
size_t new_size = 0;
|
||||
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
|
||||
size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
|
||||
size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
|
||||
new_size += new_chunk_size;
|
||||
if (new_chunk_size > cur_chunk_size) {
|
||||
realloc = true;
|
||||
}
|
||||
}
|
||||
if (realloc) {
|
||||
#ifndef NDEBUG
|
||||
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
||||
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
|
|||
}
|
||||
|
||||
bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i]) {
|
||||
ggml_mem_ranges_add_src(mrs, tensor->src[i]);
|
||||
}
|
||||
|
|
@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
|
|||
}
|
||||
|
||||
bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i]) {
|
||||
if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -409,6 +409,7 @@ struct vk_device_struct {
|
|||
vk::PhysicalDeviceProperties properties;
|
||||
std::string name;
|
||||
uint64_t max_memory_allocation_size;
|
||||
uint64_t max_buffer_size;
|
||||
uint64_t suballocation_block_size;
|
||||
bool fp16;
|
||||
bool bf16;
|
||||
|
|
@ -1579,6 +1580,12 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
|
||||
static void ggml_backend_vk_free(ggml_backend_t backend);
|
||||
|
||||
static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
|
||||
const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
|
||||
VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
|
||||
return range;
|
||||
}
|
||||
|
||||
// Wait for ctx->fence to be signaled.
|
||||
static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
|
||||
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
|
||||
|
|
@ -2028,8 +2035,8 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
|
|||
|
||||
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
|
||||
if (size > device->max_memory_allocation_size) {
|
||||
printf("\nWARNING: Requested buffer size (%zu) exceeds device memory allocation limit (%zu)!\n",size,device->max_memory_allocation_size);
|
||||
if (size > device->max_buffer_size) {
|
||||
printf("\nWARNING: Requested buffer size (%zu) exceeds device max_buffer_size limit (%zu)!\n",size,device->max_buffer_size);
|
||||
}
|
||||
|
||||
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
||||
|
|
@ -2175,8 +2182,8 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
|||
buf.reset();
|
||||
}
|
||||
|
||||
static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
||||
return { buf, 0, VK_WHOLE_SIZE };
|
||||
static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) {
|
||||
return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
|
||||
}
|
||||
|
||||
static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
|
||||
|
|
@ -2630,8 +2637,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
const uint32_t D_lsb = D ^ (D & (D-1));
|
||||
uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
|
||||
|
||||
// mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
|
||||
GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
|
||||
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
|
||||
};
|
||||
|
||||
|
|
@ -3881,17 +3886,27 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
|
||||
|
||||
if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
|
||||
device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
|
||||
device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
|
||||
} else if (maintenance4_support) {
|
||||
device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
|
||||
} else {
|
||||
device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
|
||||
}
|
||||
|
||||
const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE");
|
||||
|
||||
if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) {
|
||||
device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE);
|
||||
} else if (maintenance4_support) {
|
||||
device->max_buffer_size = props4.maxBufferSize;
|
||||
} else {
|
||||
device->max_buffer_size = device->max_memory_allocation_size;
|
||||
}
|
||||
|
||||
const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
|
||||
|
||||
if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
|
||||
device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
|
||||
device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
|
||||
} else {
|
||||
// Limit batching of allocations to 1GB by default to avoid fragmentation issues
|
||||
device->suballocation_block_size = 1024*1024*1024;
|
||||
|
|
@ -6180,9 +6195,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
}
|
||||
const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
|
||||
(split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) {
|
||||
printf("\nWarning: Requested preallocation size is too large");
|
||||
}
|
||||
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
|
||||
|
|
@ -6257,7 +6272,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
}
|
||||
|
||||
if (x_non_contig) {
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
|
||||
} else if (qx_needs_dequant) {
|
||||
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
||||
|
|
@ -6269,7 +6284,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -6280,7 +6295,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -6302,14 +6317,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
|
||||
}
|
||||
|
||||
// No bounds checking is needed for dst. This is basically VK_WHOLE_SIZE but clamped to maxStorageBufferRange.
|
||||
VkDeviceSize d_range = std::min(VkDeviceSize{d_D->size - d_buf_offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
|
||||
|
||||
// compute
|
||||
ggml_vk_matmul(
|
||||
ctx, subctx, pipeline,
|
||||
{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
|
||||
{ d_D, d_buf_offset, d_range }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
|
||||
ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
|
||||
ne01, ne11, ne10,
|
||||
ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d,
|
||||
split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
|
||||
|
|
@ -6476,8 +6488,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
|
||||
}
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
|
||||
printf("\nWarning: Requested preallocation size is too large");
|
||||
}
|
||||
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
|
||||
|
|
@ -6542,7 +6554,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
}
|
||||
|
||||
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
|
||||
}
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
|
|
@ -6551,7 +6563,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -6562,7 +6574,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
|
||||
ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
|
||||
ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -6961,8 +6973,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
|
||||
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
|
||||
printf("\nWarning: Requested preallocation size is too large");
|
||||
}
|
||||
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
|
||||
|
|
@ -7029,7 +7041,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
}
|
||||
|
||||
if (x_non_contig) {
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
|
||||
} else if (qx_needs_dequant) {
|
||||
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
|
||||
|
|
@ -7042,7 +7054,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -7175,8 +7187,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
|
||||
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
|
||||
if (
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
|
||||
(qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
|
||||
(qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
|
||||
printf("\nWarning: Requested preallocation size is too large");
|
||||
}
|
||||
if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
|
||||
|
|
@ -7242,7 +7254,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||
|
||||
if (x_non_contig) {
|
||||
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
|
||||
}
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
|
|
@ -7251,7 +7263,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|||
if (ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
||||
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
|
||||
ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
|
||||
ctx->prealloc_y_last_tensor_used = src1;
|
||||
}
|
||||
|
|
@ -7487,8 +7499,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
if (((HSK | HSV) % 16) != 0 && path == FA_COOPMAT2) {
|
||||
aligned = false;
|
||||
}
|
||||
// mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
|
||||
GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);
|
||||
|
||||
bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
|
||||
|
||||
|
|
@ -7528,7 +7538,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
|
||||
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
|
||||
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
|
||||
if (split_k_size > ctx->device->max_memory_allocation_size) {
|
||||
if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
|
||||
GGML_ABORT("Requested preallocation size is too large");
|
||||
}
|
||||
if (ctx->prealloc_size_split_k < split_k_size) {
|
||||
|
|
@ -7650,12 +7660,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
|
||||
},
|
||||
// We only use split_k when group query attention is enabled, which means
|
||||
// there's no more than one tile of rows (i.e. workgroups_x would have been
|
||||
|
|
@ -7667,21 +7677,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
|
||||
{
|
||||
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
|
||||
ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
|
||||
},
|
||||
pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
|
||||
ctx->prealloc_split_k_need_sync = true;
|
||||
} else {
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
|
||||
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
||||
ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
|
||||
ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
|
||||
},
|
||||
pc, { workgroups_x, workgroups_y, workgroups_z });
|
||||
}
|
||||
|
|
@ -8390,18 +8400,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
}
|
||||
}
|
||||
|
||||
uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
|
||||
uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
|
||||
uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ned;
|
||||
|
||||
vk_buffer d_D = dst_buf_ctx->dev_buffer;
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
|
||||
if(!src0_uma) {
|
||||
|
|
@ -8426,26 +8426,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
||||
|
||||
if (op_supports_incontiguous) {
|
||||
x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
|
||||
y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
|
||||
z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
|
||||
d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
|
||||
|
||||
if (x_buf_offset + x_sz >= d_X->size) {
|
||||
x_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
||||
z_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (d_buf_offset + d_sz >= d_D->size) {
|
||||
d_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
// Single call if dimension 2 is contiguous
|
||||
|
|
@ -8636,19 +8616,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
break;
|
||||
}
|
||||
|
||||
if (!op_supports_incontiguous) {
|
||||
if (x_sz != VK_WHOLE_SIZE) {
|
||||
x_sz *= ne02 * ne03;
|
||||
uint64_t x_sz, y_sz, z_sz, d_sz;
|
||||
|
||||
if (op_supports_incontiguous) {
|
||||
x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
|
||||
y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
|
||||
z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
|
||||
d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
|
||||
|
||||
if (x_buf_offset + x_sz >= d_X->size) {
|
||||
x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset);
|
||||
}
|
||||
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
||||
y_sz *= ne12 * ne13;
|
||||
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
||||
y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset);
|
||||
}
|
||||
if (use_src2 && z_sz != VK_WHOLE_SIZE) {
|
||||
z_sz *= ne22 * ne23;
|
||||
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
||||
z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
|
||||
}
|
||||
if (d_sz != VK_WHOLE_SIZE) {
|
||||
d_sz *= ned2 * ned3;
|
||||
if (d_buf_offset + d_sz >= d_D->size) {
|
||||
d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
|
||||
}
|
||||
} else {
|
||||
x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
|
||||
y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
|
||||
z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
|
||||
d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
|
||||
}
|
||||
|
||||
if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
|
||||
|
|
@ -8658,7 +8650,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
{ vk_subbuffer{ d_X, x_buf_offset, x_sz },
|
||||
vk_subbuffer{ d_Y, y_buf_offset, y_sz },
|
||||
vk_subbuffer{ d_D, d_buf_offset, d_sz },
|
||||
vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE },
|
||||
ggml_vk_subbuffer(ctx, d_A, a_buf_offset),
|
||||
}, pc, elements);
|
||||
} else if (op == GGML_OP_GLU) {
|
||||
// Empty src1 is possible in glu, but the shader needs a buffer
|
||||
|
|
@ -8851,18 +8843,18 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|||
static_assert(MAX_PARAMETER_COUNT == 12);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
||||
{
|
||||
vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[9], offset[9], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE },
|
||||
vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE },
|
||||
ggml_vk_subbuffer(ctx, buf[0], offset[0]),
|
||||
ggml_vk_subbuffer(ctx, buf[1], offset[1]),
|
||||
ggml_vk_subbuffer(ctx, buf[2], offset[2]),
|
||||
ggml_vk_subbuffer(ctx, buf[3], offset[3]),
|
||||
ggml_vk_subbuffer(ctx, buf[4], offset[4]),
|
||||
ggml_vk_subbuffer(ctx, buf[5], offset[5]),
|
||||
ggml_vk_subbuffer(ctx, buf[6], offset[6]),
|
||||
ggml_vk_subbuffer(ctx, buf[7], offset[7]),
|
||||
ggml_vk_subbuffer(ctx, buf[8], offset[8]),
|
||||
ggml_vk_subbuffer(ctx, buf[9], offset[9]),
|
||||
ggml_vk_subbuffer(ctx, buf[10], offset[10]),
|
||||
ggml_vk_subbuffer(ctx, buf[11], offset[11]),
|
||||
}, pc, elements);
|
||||
}
|
||||
|
||||
|
|
@ -10036,7 +10028,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
ggml_vk_matmul(
|
||||
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
||||
ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k),
|
||||
m, n, k,
|
||||
k, k, m, k*m, k*n, m*n,
|
||||
split_k, batch, batch, batch, 1, 1, n
|
||||
|
|
@ -10347,7 +10339,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|||
//
|
||||
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
// ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
|
||||
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne);
|
||||
// ggml_vk_ctx_end(subctx);
|
||||
//
|
||||
// auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
|
|
|||
|
|
@ -153,12 +153,13 @@ void main() {
|
|||
}
|
||||
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t c = (idx + tid) % Bc;
|
||||
uint32_t r = (idx + tid) / Bc;
|
||||
if (idx + tid < Bc * Br) {
|
||||
if (!KV_bounds_check || j * Bc + c < KV) {
|
||||
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
|
||||
masksh[c][r] = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
|
||||
} else {
|
||||
masksh[c][r] = float(0);
|
||||
|
|
|
|||
|
|
@ -201,11 +201,13 @@ void main() {
|
|||
}
|
||||
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
|
||||
|
||||
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
|
||||
uint32_t c = (idx + tid) % Bc;
|
||||
uint32_t r = (idx + tid) / Bc;
|
||||
if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
|
||||
if (!KV_bounds_check || j * Bc + c < KV) {
|
||||
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
|
||||
sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]));
|
||||
}
|
||||
}
|
||||
|
|
@ -356,8 +358,8 @@ void main() {
|
|||
}
|
||||
|
||||
if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2);
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
float sink = perElemOpGetSink(tile_row(r), 0u, ACC_TYPE(0), iq2);
|
||||
|
||||
float ms = 1.0f;
|
||||
float vs = 1.0f;
|
||||
|
|
|
|||
|
|
@ -154,15 +154,31 @@ void main() {
|
|||
}
|
||||
|
||||
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
||||
tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
||||
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
|
||||
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
if (nem1_bounds_check) {
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
||||
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
|
||||
|
||||
coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
|
||||
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
|
||||
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
} else {
|
||||
tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
|
||||
// Don't clamp against nem1 when GQA is enabled
|
||||
uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1;
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV);
|
||||
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
|
||||
coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
|
||||
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
}
|
||||
}
|
||||
|
||||
// Clear padding elements to -inf, so they don't contribute to rowmax
|
||||
|
|
|
|||
|
|
@ -546,6 +546,9 @@ extern "C" {
|
|||
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
||||
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
||||
|
||||
// Returns true if the model is hybrid (like Jamba, Granite, etc.)
|
||||
LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
|
||||
|
||||
// Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
|
||||
LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
|
||||
|
||||
|
|
@ -794,8 +797,12 @@ extern "C" {
|
|||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out);
|
||||
|
||||
// for backwards-compat
|
||||
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
|
||||
|
||||
// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
|
||||
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
|
||||
|
||||
typedef uint32_t llama_state_seq_flags;
|
||||
|
||||
LLAMA_API size_t llama_state_seq_get_size_ext(
|
||||
|
|
|
|||
|
|
@ -224,7 +224,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
|
|||
}
|
||||
|
||||
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
|
||||
kv_base->state_write(io, seq_id, flags);
|
||||
}
|
||||
|
||||
|
|
@ -232,7 +232,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
|
|||
}
|
||||
|
||||
void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
|
||||
kv_base->state_read(io, seq_id, flags);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -175,17 +175,17 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdo
|
|||
}
|
||||
|
||||
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
mem_attn->state_write(io, seq_id);
|
||||
mem_recr->state_write(io, seq_id);
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
|
||||
mem_attn->state_write(io, seq_id, flags);
|
||||
}
|
||||
mem_recr->state_write(io, seq_id, flags);
|
||||
}
|
||||
|
||||
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
mem_attn->state_read(io, seq_id);
|
||||
mem_recr->state_read(io, seq_id);
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
|
||||
mem_attn->state_read(io, seq_id, flags);
|
||||
}
|
||||
mem_recr->state_read(io, seq_id, flags);
|
||||
}
|
||||
|
||||
llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
|
||||
|
|
|
|||
|
|
@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
|
|||
}
|
||||
|
||||
bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
//printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
|
||||
uint32_t new_head = size;
|
||||
|
||||
if (p0 < 0) {
|
||||
|
|
@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|||
if (tail_id >= 0) {
|
||||
const auto & cell = cells[tail_id];
|
||||
// partial intersection is invalid
|
||||
if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
|
||||
if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
|
||||
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
|
||||
return false;
|
||||
}
|
||||
// invalidate tails which will be cleared
|
||||
|
|
@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|||
} else {
|
||||
// seq_id is negative, then the range should include everything or nothing
|
||||
if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
|
||||
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7905,6 +7905,8 @@ struct llm_build_bert : public llm_graph_context {
|
|||
}
|
||||
|
||||
if (model.layers[il].attn_q_norm) {
|
||||
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur,
|
||||
model.layers[il].attn_q_norm,
|
||||
model.layers[il].attn_q_norm_b,
|
||||
|
|
@ -7914,6 +7916,8 @@ struct llm_build_bert : public llm_graph_context {
|
|||
}
|
||||
|
||||
if (model.layers[il].attn_k_norm) {
|
||||
Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
|
||||
|
||||
Kcur = build_norm(Kcur,
|
||||
model.layers[il].attn_k_norm,
|
||||
model.layers[il].attn_k_norm_b,
|
||||
|
|
@ -8296,6 +8300,9 @@ struct llm_build_mpt : public llm_graph_context {
|
|||
|
||||
// Q/K Layernorm
|
||||
if (model.layers[il].attn_q_norm) {
|
||||
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
|
||||
Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur,
|
||||
model.layers[il].attn_q_norm,
|
||||
model.layers[il].attn_q_norm_b,
|
||||
|
|
@ -20206,6 +20213,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
|
|||
return llm_arch_is_recurrent(model->arch);
|
||||
}
|
||||
|
||||
bool llama_model_is_hybrid(const llama_model * model) {
|
||||
return llm_arch_is_hybrid(model->arch);
|
||||
}
|
||||
|
||||
bool llama_model_is_diffusion(const llama_model * model) {
|
||||
return llm_arch_is_diffusion(model->arch);
|
||||
}
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -764,7 +764,7 @@ struct completion_token_output {
|
|||
}
|
||||
};
|
||||
|
||||
struct swa_checkpoint {
|
||||
struct ctx_checkpoint {
|
||||
llama_pos pos_min;
|
||||
llama_pos pos_max;
|
||||
|
||||
|
|
@ -1460,7 +1460,7 @@ struct server_slot {
|
|||
|
||||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
std::vector<swa_checkpoint> swa_checkpoints;
|
||||
std::vector<ctx_checkpoint> ctx_checkpoints;
|
||||
|
||||
bool has_next_token = true;
|
||||
bool has_new_line = false;
|
||||
|
|
@ -3541,7 +3541,11 @@ struct server_context {
|
|||
slot.n_past = 0;
|
||||
}
|
||||
|
||||
const auto n_swa = llama_model_n_swa(model);
|
||||
// note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1
|
||||
const auto n_swa = std::max(1, llama_model_n_swa(model));
|
||||
|
||||
// the largest pos_min required for a checkpoint to be useful
|
||||
const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
|
||||
|
||||
if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
|
||||
|
|
@ -3550,66 +3554,62 @@ struct server_context {
|
|||
GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
|
||||
}
|
||||
|
||||
const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
|
||||
|
||||
if (pos_min > pos_min_thold) {
|
||||
SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
|
||||
|
||||
// search for a SWA checkpoint
|
||||
// search for a context checkpoint
|
||||
const auto it = std::find_if(
|
||||
slot.swa_checkpoints.rbegin(),
|
||||
slot.swa_checkpoints.rend(),
|
||||
slot.ctx_checkpoints.rbegin(),
|
||||
slot.ctx_checkpoints.rend(),
|
||||
[&](const auto & cur) {
|
||||
return cur.pos_min <= pos_min_thold;
|
||||
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
|
||||
return cur.pos_min < pos_min_thold;
|
||||
}
|
||||
);
|
||||
|
||||
bool do_reset = it == slot.swa_checkpoints.rend();
|
||||
bool do_reset = it == slot.ctx_checkpoints.rend();
|
||||
//printf("[DEBUG] `do_reset` was set to `%s`\n", do_reset ? "true" : "false");
|
||||
|
||||
if (!do_reset) {
|
||||
// restore the checkpoint
|
||||
const size_t swa_size = it->data.size();
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
// restore the context checkpoint
|
||||
const size_t ctx_checkpoint_size = it->data.size();
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), ctx_checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
if (n != swa_size) {
|
||||
SLT_ERR(slot, "failed to restore SWA checkpoint, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
|
||||
if (n != ctx_checkpoint_size) {
|
||||
SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
|
||||
do_reset = true;
|
||||
//printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
|
||||
} else {
|
||||
slot.n_past = std::min(slot.n_past, it->pos_max);
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint restore, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
|
||||
slot.n_past = std::min(slot.n_past, std::max(it->pos_min + 1, it->pos_max));
|
||||
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
|
||||
if (do_reset) {
|
||||
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
|
||||
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
|
||||
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.swa_checkpoints.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_swa > 0) {
|
||||
const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
|
||||
|
||||
{
|
||||
// erase any checkpoints with pos_min > pos_min_thold
|
||||
for (int i = (int) slot.swa_checkpoints.size() - 1; i >= 0; i--) {
|
||||
const auto & cur = slot.swa_checkpoints[i];
|
||||
for (int i = (int) slot.ctx_checkpoints.size() - 1; i >= 0; i--) {
|
||||
const auto & cur = slot.ctx_checkpoints[i];
|
||||
if (cur.pos_min > pos_min_thold) {
|
||||
slot.swa_checkpoints.erase(slot.swa_checkpoints.begin() + i);
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024);
|
||||
slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin() + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// [TAG_PROMPT_LOGITS]
|
||||
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
|
||||
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
|
||||
|
||||
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, n_prompt_tokens = %d)\n", slot.n_past, slot.n_prompt_tokens);
|
||||
slot.n_past--;
|
||||
SLT_WRN(slot, "n_past was set to %d\n", slot.n_past);
|
||||
}
|
||||
|
||||
slot.n_prompt_tokens_cache = slot.n_past;
|
||||
|
|
@ -3623,9 +3623,9 @@ struct server_context {
|
|||
}
|
||||
}
|
||||
|
||||
// keep only the common part
|
||||
// truncate any tokens that are beyond n_past for this slot
|
||||
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.n_past, -1)) {
|
||||
// could not partially delete (likely using a non-Transformer model)
|
||||
SLT_WRN(slot, "failed to truncate tokens beyond n_past = %d\n", slot.n_past);
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
|
||||
|
||||
// there is no common part left
|
||||
|
|
@ -3633,7 +3633,7 @@ struct server_context {
|
|||
slot.n_prompt_tokens_cache = 0;
|
||||
}
|
||||
|
||||
SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
|
||||
SLT_INF(slot, "n_past = %d, memory_seq_rm [%d, end)\n", slot.n_past, slot.n_past);
|
||||
|
||||
// remove the non-common part from the cache
|
||||
slot.cache_tokens.keep_first(slot.n_past);
|
||||
|
|
@ -3854,37 +3854,38 @@ struct server_context {
|
|||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
|
||||
// make a checkpoint with the SWA memory
|
||||
// checkpoints are needed only if we are not using "--swa-full"
|
||||
if (llama_model_n_swa(model) > 0 && !params_base.swa_full && params_base.n_swa_checkpoints > 0) {
|
||||
if (slot.swa_checkpoints.size() >= (size_t) params_base.n_swa_checkpoints) {
|
||||
{
|
||||
const auto & cur = slot.swa_checkpoints.back();
|
||||
// make a checkpoint of the parts of the memory that cannot be rolled back.
|
||||
// checkpoints are created only if:
|
||||
// - the model uses SWA and we are not using `swa_full`
|
||||
// - the model architecture is marked as recurrent or hybrid
|
||||
//
|
||||
// TODO: try to make this conditional on the context or the memory module, instead of the model type
|
||||
const bool do_checkpoint =
|
||||
(llama_model_is_recurrent(model) || llama_model_is_hybrid(model)) ||
|
||||
(llama_model_n_swa(model) > 0 && !params_base.swa_full);
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n",
|
||||
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
}
|
||||
if (do_checkpoint && params_base.n_ctx_checkpoints > 0) {
|
||||
while (slot.ctx_checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
|
||||
// make room for the new checkpoint, if needed
|
||||
const auto & cur = slot.ctx_checkpoints.front();
|
||||
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
|
||||
slot.swa_checkpoints.erase(slot.swa_checkpoints.begin());
|
||||
slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin());
|
||||
}
|
||||
|
||||
const size_t swa_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
auto & cur = slot.swa_checkpoints.emplace_back(swa_checkpoint{
|
||||
auto & cur = slot.ctx_checkpoints.emplace_back(ctx_checkpoint{
|
||||
/*.pos_min = */ llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id),
|
||||
/*.pos_max = */ llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id),
|
||||
/*.data = */ std::vector<uint8_t>(swa_size),
|
||||
/*.data = */ std::vector<uint8_t>(checkpoint_size),
|
||||
});
|
||||
|
||||
llama_state_seq_get_data_ext(ctx, cur.data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
float size_total = 0.0f;
|
||||
for (const auto & checkpoint : slot.swa_checkpoints) {
|
||||
size_total += (float) checkpoint.data.size() / 1024 / 1024;
|
||||
}
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint create, pos_min = %d, pos_max = %d, size = %.3f MiB, total = %d/%d (%.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024, (int) slot.swa_checkpoints.size(), params_base.n_swa_checkpoints, size_total);
|
||||
SLT_WRN(slot, "saved context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
|
||||
(int) slot.ctx_checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
}
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue; // continue loop of slots
|
||||
|
|
|
|||
|
|
@ -307,8 +307,30 @@ class ChatStore {
|
|||
onError?: (error: Error) => void
|
||||
): Promise<void> {
|
||||
let streamedContent = '';
|
||||
|
||||
let streamedReasoningContent = '';
|
||||
let modelCaptured = false;
|
||||
|
||||
const captureModelIfNeeded = (updateDbImmediately = true): string | undefined => {
|
||||
if (!modelCaptured) {
|
||||
const currentModelName = serverStore.modelName;
|
||||
|
||||
if (currentModelName) {
|
||||
if (updateDbImmediately) {
|
||||
DatabaseStore.updateMessage(assistantMessage.id, { model: currentModelName }).catch(
|
||||
console.error
|
||||
);
|
||||
}
|
||||
|
||||
const messageIndex = this.findMessageIndex(assistantMessage.id);
|
||||
|
||||
this.updateMessageAtIndex(messageIndex, { model: currentModelName });
|
||||
modelCaptured = true;
|
||||
|
||||
return currentModelName;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
};
|
||||
|
||||
slotsService.startStreaming();
|
||||
|
||||
|
|
@ -319,6 +341,8 @@ class ChatStore {
|
|||
streamedContent += chunk;
|
||||
this.currentResponse = streamedContent;
|
||||
|
||||
captureModelIfNeeded();
|
||||
|
||||
const partialThinking = extractPartialThinking(streamedContent);
|
||||
const messageIndex = this.findMessageIndex(assistantMessage.id);
|
||||
this.updateMessageAtIndex(messageIndex, {
|
||||
|
|
@ -328,7 +352,11 @@ class ChatStore {
|
|||
|
||||
onReasoningChunk: (reasoningChunk: string) => {
|
||||
streamedReasoningContent += reasoningChunk;
|
||||
|
||||
captureModelIfNeeded();
|
||||
|
||||
const messageIndex = this.findMessageIndex(assistantMessage.id);
|
||||
|
||||
this.updateMessageAtIndex(messageIndex, { thinking: streamedReasoningContent });
|
||||
},
|
||||
|
||||
|
|
@ -339,17 +367,36 @@ class ChatStore {
|
|||
) => {
|
||||
slotsService.stopStreaming();
|
||||
|
||||
await DatabaseStore.updateMessage(assistantMessage.id, {
|
||||
const updateData: {
|
||||
content: string;
|
||||
thinking: string;
|
||||
timings?: ChatMessageTimings;
|
||||
model?: string;
|
||||
} = {
|
||||
content: finalContent || streamedContent,
|
||||
thinking: reasoningContent || streamedReasoningContent,
|
||||
timings: timings
|
||||
});
|
||||
};
|
||||
|
||||
const capturedModel = captureModelIfNeeded(false);
|
||||
|
||||
if (capturedModel) {
|
||||
updateData.model = capturedModel;
|
||||
}
|
||||
|
||||
await DatabaseStore.updateMessage(assistantMessage.id, updateData);
|
||||
|
||||
const messageIndex = this.findMessageIndex(assistantMessage.id);
|
||||
|
||||
this.updateMessageAtIndex(messageIndex, {
|
||||
const localUpdateData: { timings?: ChatMessageTimings; model?: string } = {
|
||||
timings: timings
|
||||
});
|
||||
};
|
||||
|
||||
if (updateData.model) {
|
||||
localUpdateData.model = updateData.model;
|
||||
}
|
||||
|
||||
this.updateMessageAtIndex(messageIndex, localUpdateData);
|
||||
|
||||
await DatabaseStore.updateCurrentNode(this.activeConversation!.id, assistantMessage.id);
|
||||
this.activeConversation!.currNode = assistantMessage.id;
|
||||
|
|
@ -478,9 +525,6 @@ class ChatStore {
|
|||
private async createAssistantMessage(parentId?: string): Promise<DatabaseMessage | null> {
|
||||
if (!this.activeConversation) return null;
|
||||
|
||||
// Capture the current model name when creating the assistant message
|
||||
const currentModelName = serverStore.modelName;
|
||||
|
||||
return await DatabaseStore.createMessageBranch(
|
||||
{
|
||||
convId: this.activeConversation.id,
|
||||
|
|
@ -489,8 +533,7 @@ class ChatStore {
|
|||
content: '',
|
||||
timestamp: Date.now(),
|
||||
thinking: '',
|
||||
children: [],
|
||||
model: currentModelName || undefined
|
||||
children: []
|
||||
},
|
||||
parentId || null
|
||||
);
|
||||
|
|
@ -817,15 +860,18 @@ class ChatStore {
|
|||
this.currentResponse = '';
|
||||
|
||||
try {
|
||||
const assistantMessage = await this.createAssistantMessage();
|
||||
const parentMessageId =
|
||||
this.activeMessages.length > 0
|
||||
? this.activeMessages[this.activeMessages.length - 1].id
|
||||
: null;
|
||||
|
||||
const assistantMessage = await this.createAssistantMessage(parentMessageId);
|
||||
|
||||
if (!assistantMessage) {
|
||||
throw new Error('Failed to create assistant message');
|
||||
}
|
||||
|
||||
this.activeMessages.push(assistantMessage);
|
||||
await DatabaseStore.updateCurrentNode(this.activeConversation.id, assistantMessage.id);
|
||||
this.activeConversation.currNode = assistantMessage.id;
|
||||
|
||||
const conversationContext = this.activeMessages.slice(0, -1);
|
||||
|
||||
|
|
@ -1081,8 +1127,10 @@ class ChatStore {
|
|||
(m) => m.role === 'user' && m.parent === rootMessage?.id
|
||||
);
|
||||
|
||||
await DatabaseStore.updateCurrentNode(this.activeConversation.id, siblingId);
|
||||
this.activeConversation.currNode = siblingId;
|
||||
const currentLeafNodeId = findLeafNode(allMessages, siblingId);
|
||||
|
||||
await DatabaseStore.updateCurrentNode(this.activeConversation.id, currentLeafNodeId);
|
||||
this.activeConversation.currNode = currentLeafNodeId;
|
||||
await this.refreshActiveMessages();
|
||||
|
||||
// Only show title dialog if we're navigating between different first user message siblings
|
||||
|
|
@ -1287,9 +1335,6 @@ class ChatStore {
|
|||
this.isLoading = true;
|
||||
this.currentResponse = '';
|
||||
|
||||
// Capture the current model name when creating the assistant message
|
||||
const currentModelName = serverStore.modelName;
|
||||
|
||||
const newAssistantMessage = await DatabaseStore.createMessageBranch(
|
||||
{
|
||||
convId: this.activeConversation.id,
|
||||
|
|
@ -1298,8 +1343,7 @@ class ChatStore {
|
|||
role: 'assistant',
|
||||
content: '',
|
||||
thinking: '',
|
||||
children: [],
|
||||
model: currentModelName || undefined
|
||||
children: []
|
||||
},
|
||||
parentMessage.id
|
||||
);
|
||||
|
|
@ -1346,9 +1390,6 @@ class ChatStore {
|
|||
false
|
||||
) as DatabaseMessage[];
|
||||
|
||||
// Capture the current model name when creating the assistant message
|
||||
const currentModelName = serverStore.modelName;
|
||||
|
||||
// Create new assistant message branch
|
||||
const assistantMessage = await DatabaseStore.createMessageBranch(
|
||||
{
|
||||
|
|
@ -1358,8 +1399,7 @@ class ChatStore {
|
|||
role: 'assistant',
|
||||
content: '',
|
||||
thinking: '',
|
||||
children: [],
|
||||
model: currentModelName || undefined
|
||||
children: []
|
||||
},
|
||||
userMessageId
|
||||
);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue