diff --git a/CMakeLists.txt b/CMakeLists.txt
index 364bab0de..a8e347c98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -426,6 +426,7 @@ add_library(ggml
             ggml/src/ggml-backend.cpp
             ggml/src/ggml-backend-impl.h
             ggml/include/ggml-backend.h
+            ggml/include/ggml-cpp.h
             ggml/src/ggml-quants.c
             ggml/src/ggml-quants.h
             ggml/src/llamafile/sgemm.cpp
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index bc68f68af..915e21836 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -230,7 +230,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -257,11 +257,11 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--base", type=Path, required=True,
-        help="directory containing base model file",
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
     )
     parser.add_argument(
         "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
    )
     return parser.parse_args()
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fab5c0d8..f90dfcca8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -726,12 +726,12 @@ struct server_context {
         return nullptr;
     }

-    server_slot * get_available_slot(const std::string & prompt) {
+    server_slot * get_available_slot(const server_task & task) {
         server_slot * ret = nullptr;

         // find the slot that has at least n% prompt similarity
-        if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
-            int max_lcp_len = 0;
+        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+            int max_lcs_len = 0;
             float similarity = 0;

             for (server_slot & slot : slots) {
@@ -741,25 +741,25 @@ struct server_context {
                 }

                 // skip the slot if it does not contains cached tokens
-                if (slot.prompt_tokens.empty()) {
+                if (slot.cache_tokens.empty()) {
                     continue;
                 }

-                // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
+                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
+                int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);

-                // fraction of the common substring length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcp_len) / static_cast<float>(slot.prompt_tokens.size());
+                // fraction of the common subsequence length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcs_len) / static_cast<float>(slot.cache_tokens.size());

                 // select the current slot if the criteria match
-                if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
-                    max_lcp_len = lcp_len;
+                if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
+                    max_lcs_len = lcs_len;
                     ret = &slot;
                 }
             }

             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
             }
         }
"selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity); } } @@ -1515,18 +1515,7 @@ struct server_context { { const int id_slot = json_value(task.data, "id_slot", -1); - server_slot * slot; - - if (id_slot != -1) { - slot = get_slot_by_id(id_slot); - } else { - std::string prompt; - if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { - prompt = json_value(task.data, "prompt", std::string()); - } - - slot = get_available_slot(prompt); - } + server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); if (slot == nullptr) { // if no slot is available, we defer this task for processing later @@ -3260,7 +3249,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.terminate(); }; - LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); ctx_server.queue_tasks.start_loop(); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 58f5a5684..871a17a4f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -439,18 +439,60 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t longest_common_prefix(const std::string & a, const std::string & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} +static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } - return i; + // get the lengths of the input sequences + int a_len = a.size(); + int b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + int max_length = 0; + + // use two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (int i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (int j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; } static bool ends_with(const std::string & str, const std::string & suffix) { diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt new file mode 100644 index 000000000..87723533b --- /dev/null +++ b/examples/simple-chat/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-simple-chat) +add_executable(${TARGET} simple-chat.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) 
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
new file mode 100644
index 000000000..87723533b
--- /dev/null
+++ b/examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/simple-chat/README.md b/examples/simple-chat/README.md
new file mode 100644
index 000000000..f0099ce3d
--- /dev/null
+++ b/examples/simple-chat/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/example/simple-chat
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
+
+```bash
+./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
+...
diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
new file mode 100644
index 000000000..14264cfcb
--- /dev/null
+++ b/examples/simple-chat/simple-chat.cpp
@@ -0,0 +1,197 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
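A side note on the sampler setup in `simple-chat.cpp` above: the chain is built from independent stages (a min-p filter, temperature, then a seeded draw), so it can be reshaped without touching the generation loop. The sketch below assumes the `llama_sampler_init_*` constructors from `llama.h`; the top-k value and the greedy variant are illustrative choices, not part of this change.

```cpp
// Sketch: alternative sampler chains for the example above (values are illustrative).
#include "llama.h"

static llama_sampler * make_sampler(bool deterministic) {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());

    if (deterministic) {
        // always pick the most probable token - handy when debugging the chat loop
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    } else {
        // same shape as the example: filter first, then temperature, then a seeded draw
        llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
        llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    }

    return smpl; // the caller releases it with llama_sampler_free()
}
```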
diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h
new file mode 100644
index 000000000..219361af4
--- /dev/null
+++ b/ggml/include/ggml-cpp.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml
+
+struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
+struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
+
+typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
+typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
+
+// ggml-alloc
+
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
+
+typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> 
ggml_gallocr_ptr; + +// ggml-backend + +struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } }; +struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } }; +struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } }; +struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } }; + +typedef std::unique_ptr ggml_backend_ptr; +typedef std::unique_ptr ggml_backend_buffer_ptr; +typedef std::unique_ptr ggml_backend_event_ptr; +typedef std::unique_ptr ggml_backend_sched_ptr; diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h index 171465456..c0c43521b 100644 --- a/ggml/include/ggml-kompute.h +++ b/ggml/include/ggml-kompute.h @@ -11,6 +11,8 @@ extern "C" { #endif +#define GGML_KOMPUTE_MAX_DEVICES 16 + struct ggml_vk_device { int index; int type; // same as VkPhysicalDeviceType @@ -41,6 +43,8 @@ GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); +GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void); + #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 6c9066c43..6457a5f15 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -217,7 +217,6 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 2048 -#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 #define GGML_MAX_N_THREADS 512 #define GGML_MAX_OP_PARAMS 64 @@ -565,10 +564,10 @@ extern "C" { enum ggml_log_level { GGML_LOG_LEVEL_NONE = 0, - GGML_LOG_LEVEL_INFO = 1, - GGML_LOG_LEVEL_WARN = 2, - GGML_LOG_LEVEL_ERROR = 3, - GGML_LOG_LEVEL_DEBUG = 4, + GGML_LOG_LEVEL_DEBUG = 1, + GGML_LOG_LEVEL_INFO = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_ERROR = 4, GGML_LOG_LEVEL_CONT = 5, // continue previous log }; @@ -662,13 +661,6 @@ extern "C" { void * abort_callback_data; }; - // scratch buffer - struct ggml_scratch { - size_t offs; - size_t size; - void * data; - }; - struct ggml_init_params { // memory pool size_t mem_size; // bytes @@ -766,12 +758,12 @@ extern "C" { // main - GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); - GGML_API void ggml_free(struct ggml_context * ctx); + GGML_API struct ggml_context * ggml_init (struct ggml_init_params params); + GGML_API void ggml_reset(struct ggml_context * ctx); + GGML_API void ggml_free (struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index b27f41147..eb30f8944 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -991,6 +991,73 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } return; +#elif defined(__riscv_v_intrinsic) + if (__riscv_vlenb() >= QK4_0) { + const size_t vl = QK4_0; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + for (int l = 0; l < nb; l++) { + const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0]; + const 
int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8]; + const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16]; + const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4)); + + const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); + const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); + const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); + const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + // vector version needs Zvfhmin extension + const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d); + const float b_scales[8] = { + GGML_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); + sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4); + } + __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4); + } + return; + } #endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) { float sumf[8]; @@ -3171,6 +3238,207 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } } + return; + } +#elif defined(__riscv_v_intrinsic) + if (__riscv_vlenb() >= QK4_0) { + const size_t vl = QK4_0; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + for (int l = 0; l < nb; l++) { + const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); + const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); + const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); + const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + // vector version needs Zvfhmin extension + const float a_scales[4] = { + GGML_FP16_TO_FP32(a_ptr[l].d[0]), + GGML_FP16_TO_FP32(a_ptr[l].d[1]), + GGML_FP16_TO_FP32(a_ptr[l].d[2]), + GGML_FP16_TO_FP32(a_ptr[l].d[3]) + }; + const float b_scales[8] = { + GGML_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + + const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0]; + const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32]; + const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64]; + const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l0; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l0 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = 
__riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4); + sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8]; + const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40]; + const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72]; + const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l1; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l1 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4); + sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16]; + const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48]; + const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80]; + const 
int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l2; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l2 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4); + sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24]; + const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56]; + const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88]; + const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l3; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l3 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3)); + 
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4); + sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4); + } + } + __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4); + __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4); + __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4); + __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4); + } + } + return; } #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9bb17a103..233c0c63e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -562,6 +562,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-cann.h" #endif +#ifdef GGML_USE_KOMPUTE +#include "ggml-kompute.h" +#endif + struct ggml_backend_registry { std::vector backends; std::vector devices; @@ -591,8 +595,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_AMX register_backend(ggml_backend_amx_reg()); #endif - - // TODO: kompute +#ifdef GGML_USE_KOMPUTE + register_backend(ggml_backend_kompute_reg()); +#endif register_backend(ggml_backend_cpu_reg()); } @@ -1503,7 +1508,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } -#if 1 +#if 0 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) 
sprintf(causes[hash_id(node)], __VA_ARGS__) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 9050c724d..9895d3feb 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3111,18 +3111,20 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } return false; } break; + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0; + break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: - case GGML_OP_RMS_NORM: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 1f2220234..2fea9e4cc 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -20,6 +20,7 @@ #include "shaderop_mul_mat_q8_0.h" #include "shaderop_mul_mat_q4_0.h" #include "shaderop_mul_mat_q4_1.h" +#include "shaderop_mul_mat_q4_k.h" #include "shaderop_mul_mat_q6_k.h" #include "shaderop_mul_mat_mat_f32.h" #include "shaderop_getrows_f32.h" @@ -42,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -273,18 +275,9 @@ static std::vector ggml_vk_available_devices_internal(size_t mem return results; } -// public API returns a C-style array -ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) { - auto devices = ggml_vk_available_devices_internal(memoryRequired); - *count = devices.size(); - if (devices.empty()) { - return nullptr; - } - - size_t nbytes = sizeof (ggml_vk_device) * (devices.size()); - auto * arr = static_cast(malloc(nbytes)); - memcpy(arr, devices.data(), nbytes); - return arr; +static std::vector& ggml_vk_available_devices() { + static std::vector devices = ggml_vk_available_devices_internal(0); + return devices; } static void ggml_vk_filterByVendor(std::vector& devices, const std::string& targetVendor) { @@ -341,7 +334,7 @@ ggml_vk_device ggml_vk_current_device() { if (!komputeManager()->hasDevice()) return ggml_vk_device(); - auto devices = ggml_vk_available_devices_internal(0); + auto devices = ggml_vk_available_devices(); ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data()); GGML_ASSERT(!devices.empty()); return devices.front(); @@ -1075,6 +1068,40 @@ static void ggml_vk_mul_mat_q8_0(Args&&... 
args) { ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward(args)...); } +static void ggml_vk_mul_mat_q4_k( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10, + int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, + int32_t ne1, int32_t r2, int32_t r3 +) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, + kp::shader_data::op_mul_mat_q4_k_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3; + } pushConsts { + 0, 0, 0, + ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + static void ggml_vk_mul_mat_q6_k( kp::Sequence& seq, const std::shared_ptr& inA, @@ -1323,17 +1350,7 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) { ggml_vk_cpy(spirv, 2, 4, std::forward(args)...); } -static bool ggml_vk_supports_op(const struct ggml_tensor * op) { - switch (op->type) { - case GGML_TYPE_F16: - case GGML_TYPE_F32: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - break; - default: - return false; - } - +static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -1402,6 +1419,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) { case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: return true; default: ; @@ -1410,6 +1428,8 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) { ; } return false; + + GGML_UNUSED(dev); } static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { @@ -1458,11 +1478,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml any_commands_recorded = true; - if (!ggml_vk_supports_op(dst)) { - fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ABORT("unsupported op"); - } - const int32_t ne00 = src0 ? src0->ne[0] : 0; const int32_t ne01 = src0 ? src0->ne[1] : 0; const int32_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -1656,6 +1671,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 ); break; + case GGML_TYPE_Q4_K: + ggml_vk_mul_mat_q4_k( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03 + ); + break; case GGML_TYPE_Q6_K: ggml_vk_mul_mat_q6_k( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, @@ -1907,25 +1928,31 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) { - static std::vector bufts = []() { - std::vector vec; - auto devices = ggml_vk_available_devices_internal(0); - vec.reserve(devices.size()); + static std::mutex mutex; + std::lock_guard lock(mutex); - for (const auto & dev : devices) { - vec.push_back({ - /* .iface = */ ggml_backend_kompute_buffer_type_interface, - /* .device = */ nullptr, - /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc) - }); + auto devices = ggml_vk_available_devices(); + int32_t device_count = (int32_t) devices.size(); + GGML_ASSERT(device < device_count); + GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES); + + static ggml_backend_buffer_type + ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES]; + + static bool ggml_backend_kompute_buffer_type_initialized = false; + + if (!ggml_backend_kompute_buffer_type_initialized) { + for (int32_t i = 0; i < device_count; i++) { + ggml_backend_kompute_buffer_types[i] = { + /* .iface = */ ggml_backend_kompute_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i), + /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc }, + }; } - return vec; - }(); + ggml_backend_kompute_buffer_type_initialized = true; + } - auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) { - return device == static_cast(t.context)->device; - }); - return it < bufts.end() ? 
&*it : nullptr; + return &ggml_backend_kompute_buffer_types[device]; } // backend @@ -1953,16 +1980,6 @@ static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, st return GGML_STATUS_SUCCESS; } -static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - GGML_UNUSED(backend); - return ggml_vk_supports_op(op); -} - -static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - GGML_UNUSED(backend); - return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name; -} - static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, @@ -1991,7 +2008,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) { ggml_backend_t kompute_backend = new ggml_backend { /* .guid = */ ggml_backend_kompute_guid(), /* .interface = */ kompute_backend_i, - /* .device = */ nullptr, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device), /* .context = */ s_kompute_context, }; @@ -2001,3 +2018,167 @@ ggml_backend_t ggml_backend_kompute_init(int device) { bool ggml_backend_is_kompute(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); } + +static size_t ggml_backend_kompute_get_device_count() { + auto devices = ggml_vk_available_devices(); + return devices.size(); +} + +static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) { + auto devices = ggml_vk_available_devices(); + GGML_ASSERT((size_t) device < devices.size()); + snprintf(description, description_size, "%s", devices[device].name); +} + +static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) { + auto devices = ggml_vk_available_devices(); + GGML_ASSERT((size_t) device < devices.size()); + *total = devices[device].heapSize; + *free = devices[device].heapSize; +} + +////////////////////////// + +struct ggml_backend_kompute_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + ggml_backend_kompute_get_device_memory(ctx->device, free, total); +} + +static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + return ggml_backend_kompute_buffer_type(ctx->device); +} + +static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) { + return false; + } + + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context 
*)buft->context; + + return buft_ctx->device == ctx->device; +} + +static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_kompute_device_get_name(dev); + props->description = ggml_backend_kompute_device_get_description(dev); + props->type = ggml_backend_kompute_device_get_type(dev); + ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async = */ false, + /* host_buffer = */ false, + /* .buffer_from_host_ptr = */ false, + /* events = */ false, + }; +} + +static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; + return ggml_backend_kompute_init(ctx->device); +} + +static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_kompute_device_i = { + /* .get_name = */ ggml_backend_kompute_device_get_name, + /* .get_description = */ ggml_backend_kompute_device_get_description, + /* .get_memory = */ ggml_backend_kompute_device_get_memory, + /* .get_type = */ ggml_backend_kompute_device_get_type, + /* .get_props = */ ggml_backend_kompute_device_get_props, + /* .init_backend = */ ggml_backend_kompute_device_init, + /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_kompute_device_supports_op, + /* .supports_buft = */ ggml_backend_kompute_device_supports_buft, + /* .offload_op = */ ggml_backend_kompute_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return "Kompute"; +} + +static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return ggml_backend_kompute_get_device_count(); +} + +static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) { + ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context; + char desc[256]; + ggml_backend_kompute_get_device_description(i, desc, sizeof(desc)); + ctx->device = i; + ctx->name = "Kompute" + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_kompute_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; + } + } + + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { + /* .get_name = */ ggml_backend_kompute_reg_get_name, + /* .get_device_count = */ 
ggml_backend_kompute_reg_get_device_count, + /* .get_device = */ ggml_backend_kompute_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_kompute_reg() { + static ggml_backend_reg reg = { + /* .iface = */ ggml_backend_kompute_reg_i, + /* .context = */ nullptr, + }; + + return ® +} diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index ba8ba82b4..693e76de3 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor return buf; } - buf->size = size; vk::BufferCreateInfo buffer_create_info{ vk::BufferCreateFlags(), size, @@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor if (memory_type_index == UINT32_MAX) { device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } @@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor } catch (const vk::SystemError& e) { device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw e; } } else { // Out of Host/Device memory, clean up buffer device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw e; } } @@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); buf->device = device; + buf->size = size; #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger->log_allocation(buf, size); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 04dce20b8..50d6b497d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ } #define GGML_DEBUG 0 + #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -2022,18 +2023,14 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); struct ggml_context { size_t mem_size; - void* mem_buffer; + void * mem_buffer; bool mem_buffer_owned; bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers int n_objects; struct ggml_object * objects_begin; struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; }; struct ggml_context_container { @@ -3280,7 +3277,6 @@ struct ggml_numa_nodes { // struct ggml_state { - struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; struct ggml_numa_nodes numa; }; @@ -3866,7 +3862,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); g_state = (struct ggml_state) { - /*.contexts =*/ { { 0 } }, /*.numa =*/ { .n_nodes = 0, .total_cpus = 0, @@ -3889,26 +3884,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { is_first_call = false; } - // find non-used context in g_state - struct ggml_context * ctx = NULL; + ggml_critical_section_end(); - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (!g_state.contexts[i].used) { - g_state.contexts[i].used = true; - ctx = &g_state.contexts[i].context; - - GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); - break; - } - } - - if (ctx == NULL) { - GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); - - ggml_critical_section_end(); - - return NULL; - } + struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context)); // allow to call ggml_init with 0 size if (params.mem_size == 0) { @@ -3922,12 +3900,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, - /*.no_alloc_save =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -3936,56 +3911,35 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: context initialized\n", __func__); - ggml_critical_section_end(); - return ctx; } +void ggml_reset(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + + ctx->n_objects = 0; + ctx->objects_begin = NULL; + ctx->objects_end = NULL; +} + void ggml_free(struct ggml_context * ctx) { if (ctx == NULL) { return; } - // make this function thread safe - ggml_critical_section_start(); - - bool found = false; - - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (&g_state.contexts[i].context == ctx) { - g_state.contexts[i].used = false; - - GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", - __func__, i, ggml_used_mem(ctx)); - - if (ctx->mem_buffer_owned) { - ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); - } - - found = true; - break; - } + if (ctx->mem_buffer_owned) { + ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); } - if (!found) { - GGML_PRINT_DEBUG("%s: context not found\n", __func__); - } - - ggml_critical_section_end(); + GGML_FREE(ctx); } size_t ggml_used_mem(const struct ggml_context * ctx) { return ctx->objects_end == NULL ? 
0 : ctx->objects_end->offs + ctx->objects_end->size; } -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { - const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; - - ctx->scratch = scratch; - - return result; -} - bool ggml_get_no_alloc(struct ggml_context * ctx) { return ctx->no_alloc; } @@ -4013,27 +3967,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { return max_size; } -// IMPORTANT: -// when creating "opt" tensors, always save and load the scratch buffer -// this is an error prone process, but it is necessary to support inplace -// operators when using scratch buffers -// TODO: implement a better way -static void ggml_scratch_save(struct ggml_context * ctx) { - // this is needed to allow opt tensors to store their data - // TODO: again, need to find a better way - ctx->no_alloc_save = ctx->no_alloc; - ctx->no_alloc = false; - - ctx->scratch_save = ctx->scratch; - ctx->scratch.data = NULL; -} - -static void ggml_scratch_load(struct ggml_context * ctx) { - ctx->no_alloc = ctx->no_alloc_save; - - ctx->scratch = ctx->scratch_save; -} - //////////////////////////////////////////////////////////////////////////////// static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { @@ -4114,29 +4047,13 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; if (view_src == NULL && !ctx->no_alloc) { - if (ctx->scratch.data != NULL) { - // allocate tensor data in the scratch buffer - if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + data_size, ctx->scratch.size); - assert(false); - return NULL; - } - - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - ctx->scratch.offs += data_size; - } else { - // allocate tensor data in the context's memory pool - obj_alloc_size = data_size; - } + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; } struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); GGML_ASSERT(obj_new); - // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here - struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); #ifdef __clang__ @@ -4232,24 +4149,16 @@ struct ggml_tensor * ggml_new_tensor_4d( } struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - ggml_scratch_load(ctx); - ggml_set_i32(result, value); return result; } struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_scratch_load(ctx); - ggml_set_f32(result, value); return result; @@ -7297,6 +7206,7 @@ struct ggml_tensor * ggml_ssm_conv( const int64_t n_s = sx->ne[2]; // TODO: maybe support other strides than 1? + // FIXME: this is always true? GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t); GGML_ASSERT(sx->ne[1] == d_inner); GGML_ASSERT(n_t >= 0); @@ -20351,7 +20261,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { uint64_t size_eval = 0; // compute size of intermediate results - // TODO: does not take into account scratch buffers !!!! 
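The removal of the scratch-buffer API above goes together with the new `ggml_reset()`: instead of carving temporary allocations out of a scratch region, a caller can keep one fixed-size context and clear it between iterations. Below is a minimal sketch of that pattern, assuming only the public API touched by this diff (`ggml_init`, `ggml_reset`, `ggml_free`); the pool size is an arbitrary example value.

```cpp
// Sketch of the reuse pattern enabled by ggml_reset(); 16 MiB is an arbitrary example size.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,       // let ggml allocate the pool
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    for (int step = 0; step < 3; step++) {
        // build temporary tensors in the pool
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) t;

        printf("step %d: used %zu bytes\n", step, ggml_used_mem(ctx));

        // drop all objects but keep the underlying buffer for the next step
        ggml_reset(ctx);
    }

    // contexts are now heap-allocated by ggml_init(), so each one must be freed
    ggml_free(ctx);

    return 0;
}
```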
for (int i = 0; i < cgraph->n_nodes; ++i) { size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } @@ -22162,18 +22071,46 @@ static size_t gguf_type_size(enum gguf_type type) { return GGUF_TYPE_SIZE[type]; } -static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { - GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS); - GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT); +static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { + if (info->n_dims > GGML_MAX_DIMS) { + fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims); + return false; + } + + if (info->type < 0 || info->type >= GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type); + return false; + } + + if (strlen(info->name.data) >= GGML_MAX_NAME) { + fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data); + return false; + } for (uint32_t i = 0; i < info->n_dims; ++i) { - GGML_ASSERT(info->ne[i] > 0); + if (info->ne[i] <= 0) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]); + return false; + } } // prevent overflow for total number of elements - GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]); - GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]); - GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]); + if (INT64_MAX/info->ne[1] <= info->ne[0]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]); + return false; + } + + if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]); + return false; + } + + if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]); + return false; + } + + return true; } static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { @@ -22197,7 +22134,11 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset return false; } - p->data = GGML_CALLOC(p->n + 1, 1); + p->data = calloc(p->n + 1, 1); + if (!p->data) { + fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n); + return false; + } ok = ok && gguf_fread_el(file, p->data, p->n, offset); @@ -22244,7 +22185,11 @@ static void gguf_free_kv(struct gguf_kv * kv) { } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); + struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context)); + if (!ctx) { + fprintf(stderr, "%s: failed to allocate memory for context\n", __func__); + return NULL; + } memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -22290,7 +22235,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); + struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context)); + if (!ctx) { + fprintf(stderr, "%s: failed to allocate memory for context\n", __func__); + fclose(file); + return NULL; + } // read the header { @@ -22345,9 +22295,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p { const uint64_t n_kv = ctx->header.n_kv; - // header.n_kv will hold the actual value of pairs that were successfully read in the loop below - ctx->header.n_kv = 0; - ctx->kv = 
GGML_CALLOC(n_kv, sizeof(struct gguf_kv)); + ctx->kv = calloc(n_kv, sizeof(struct gguf_kv)); + if (!ctx->kv) { + fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } for (uint64_t i = 0; i < n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -22406,7 +22360,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); + kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); + if (!kv->value.arr.data) { + fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); } break; @@ -22420,24 +22380,36 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str)); + kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str)); + if (!kv->value.arr.data) { + fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); } } break; case GGUF_TYPE_ARRAY: - default: GGML_ABORT("invalid type"); + default: + { + fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type); + ok = false; + } break; } } break; - default: GGML_ABORT("invalid type"); + default: + { + fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type); + ok = false; + } break; } if (!ok) { break; } - - ctx->header.n_kv++; } if (!ok) { @@ -22450,7 +22422,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos if (ctx->header.n_tensors > 0) { - ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); + ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); + if (!ctx->infos) { + fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -22477,8 +22455,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); - // TODO: return an error instead of crashing with GGML_ASSERT - gguf_tensor_info_sanitize(info); + ok = ok && gguf_tensor_info_sanitize(info); // make sure there is no duplicated tensor names for (uint64_t j = 0; j < i && ok; ++j) { diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp new file mode 100644 index 000000000..fc8e45aa9 --- /dev/null +++ b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp @@ -0,0 +1,133 @@ +#version 450 + +#include "common.comp" + +#define N_DST 4 +#define SIZE_OF_BLOCK sizeof_block_q4_k + +layout(local_size_x = 4) in; +layout(local_size_y = 8) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer 
tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int ne02; + int ne12; + int r2; + int r3; +} pcs; + +void main() { + const uint16_t kmask1 = uint16_t(0x3f3f); + const uint16_t kmask2 = uint16_t(0x0f0f); + const uint16_t kmask3 = uint16_t(0xc0c0); + + const uint ix = gl_SubgroupInvocationID/8; // 0...3 + const uint it = gl_SubgroupInvocationID%8; // 0...7 + const uint iq = it/4; // 0 or 1 + const uint ir = it%4; // 0...3 + + const uint nb = pcs.ne00/QK_K; + + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + const uint first_row = r0 * N_DST; + const uint ib_row = first_row * nb; + + const uint i12 = im%pcs.ne12; + const uint i13 = im/pcs.ne12; + + const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); + + const uint xblk = ib_row + offset0 + pcs.inAOff; + const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; + + float yl[16]; + float yh[16]; + float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f}; + float all_sum = 0.f; + + uint y4 = y + ix * QK_K + 64 * iq + 8 * ir; + + for (uint ib = ix; ib < nb; ib += 4) { + const uint blk_idx = ib + xblk; + + float sumy[4] = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0]; + yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8]; + } + + for (int row = 0; row < N_DST; row++) { + uint row_idx = row * nb; + + uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); + uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); + uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4); + uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6); + uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8); + + uint16_t sc16[4]; + sc16[0] = sc_0 & kmask1; + sc16[1] = sc_2 & kmask1; + sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2); + sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2); + + float acc1[4] = {0.f, 0.f, 0.f, 0.f}; + float acc2[4] = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i); + uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i); + acc1[0] += yl[i+0] * (q1 & 0x000F); + acc1[1] += yl[i+1] * (q1 & 0x0F00); + acc1[2] += yl[i+8] * (q1 & 0x00F0); + acc1[3] += yl[i+9] * (q1 & 0xF000); + acc2[0] += yh[i+0] * (q2 & 0x000F); + acc2[1] += yh[i+1] * (q2 & 0x0F00); + acc2[2] += yh[i+8] * (q2 & 0x00F0); + acc2[3] += yh[i+9] * (q2 & 0xF000); + } + + uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF); + uint8_t sc8_1 = uint8_t(sc16[0] >> 8 ); + uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF); + uint8_t sc8_3 = uint8_t(sc16[1] >> 8 ); + uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF); + uint8_t sc8_5 = uint8_t(sc16[2] >> 8 ); + uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF); + uint8_t sc8_7 = uint8_t(sc16[3] >> 8 ); + + float dall = float(inA[blk_idx + row_idx].d); + float dmin = float(inA[blk_idx + row_idx].dmin); + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) - + dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7); + } + + y4 += 4 * QK_K; + } + + for (int row 
= 0; row < N_DST; ++row) { + all_sum = subgroupAdd(sumf[row]); + if (subgroupElect()) { + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum; + } + } +} diff --git a/klite.embd b/klite.embd index 12c4f23f1..230dffb85 100644 --- a/klite.embd +++ b/klite.embd @@ -8836,12 +8836,7 @@ Current version indicated by LITEVER below. if(desired_oai_key!=""){ oaiheaders["Authorization"] = "Bearer " + desired_oai_key; }; - if (!desired_oai_ep.toLowerCase().includes("api.mistral.ai")) { - if(desired_oai_key!="") - { - oaiheaders["x-api-key"] = desired_oai_key; - } - }else{ + if (desired_oai_ep.toLowerCase().includes("api.mistral.ai")) { if(desired_oai_key=="") { msgbox("MistralAI API requires an API key to fetch model list!"); @@ -13229,12 +13224,6 @@ Current version indicated by LITEVER below. 'Authorization': 'Bearer ' + custom_oai_key }; - if(!targetep.toLowerCase().includes("openrouter.ai") && - !targetep.toLowerCase().includes("api.mistral.ai")) - { - oaiheaders["x-api-key"] = custom_oai_key; - } - if(targetep.toLowerCase().includes("openrouter.ai")) { oaiheaders["HTTP-Referer"] = "https://lite.koboldai.net"; diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h new file mode 120000 index 000000000..8a8604cc2 --- /dev/null +++ b/spm-headers/ggml-cpp.h @@ -0,0 +1 @@ +../ggml/include/ggml-cpp.h \ No newline at end of file diff --git a/src/llama.cpp b/src/llama.cpp index 3adc642b4..882e90be6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -10,6 +10,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "ggml-cpp.h" #if defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" @@ -2811,31 +2812,22 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; - size_t total_size() const { + size_t total_size() { size_t size = 0; - for (ggml_backend_buffer_t buf : bufs) { - size += ggml_backend_buffer_get_size(buf); + for (auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); } return size; } - - ~llama_kv_cache() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } }; struct llama_control_vector { std::vector tensors; // per layer - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; int32_t layer_start = -1; int32_t layer_end = -1; @@ -2854,15 +2846,6 @@ struct llama_control_vector { } return cur; } - - ~llama_control_vector() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } }; struct llama_model { @@ -2922,10 +2905,10 @@ struct llama_model { std::vector dev_layer; // contexts where the model tensors metadata is stored - std::vector ctxs; + std::vector ctxs; // the model memory buffers for the tensor data - std::vector bufs; + std::vector bufs; // model memory mapped files llama_mmaps mappings; @@ -2944,13 +2927,7 @@ struct llama_model { std::set lora_adapters; ~llama_model() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - while (!lora_adapters.empty()) { + while (!lora_adapters.empty()) { llama_lora_adapter_free(*lora_adapters.begin()); } } @@ -3267,16 +3244,6 @@ struct llama_context { , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} - ~llama_context() { - ggml_backend_sched_free(sched); - - for 
(ggml_backend_t backend : backends) { - ggml_backend_free(backend); - } - - ggml_backend_buffer_free(buf_output); - } - const struct llama_model & model; struct llama_cparams cparams; @@ -3286,7 +3253,7 @@ struct llama_context { std::unordered_map lora_adapters; - std::vector backends; + std::vector backends; std::vector> set_n_threads_fns; ggml_backend_t backend_cpu = nullptr; @@ -3308,7 +3275,7 @@ struct llama_context { mutable int32_t n_eval = 0; // number of eval calls // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_t buf_output = nullptr; + ggml_backend_buffer_ptr buf_output; // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits @@ -3338,7 +3305,7 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_sched_t sched = nullptr; + ggml_backend_sched_ptr sched; ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; @@ -3372,8 +3339,8 @@ struct llama_lora_adapter { struct llama_model * base_model; // map tensor name to lora_a_b std::unordered_map ab_map; - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; float alpha; @@ -3391,12 +3358,6 @@ struct llama_lora_adapter { } ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } auto pos = base_model->lora_adapters.find(this); if (pos != base_model->lora_adapters.end()) { base_model->lora_adapters.erase(pos); @@ -3415,7 +3376,7 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); + ggml_context_ptr ctx { ggml_init(params) }; if (!ctx) { throw std::runtime_error(format("failed to create ggml context")); } @@ -3423,20 +3384,16 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d #if defined(GGML_USE_CLBLAST) ggml_cl_init(); #endif - - ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0); - ggml_tensor * op_tensor = fn(ctx); + ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) }; + ggml_tensor * op_tensor = fn(ctx.get()); for (int i = 0; i < GGML_MAX_SRC; i++) { if (op_tensor->src[i] != nullptr) { assert(op_tensor->src[i]->buffer == nullptr); - op_tensor->src[i]->buffer = buf; + op_tensor->src[i]->buffer = buf.get(); } } bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); - ggml_free(ctx); - ggml_backend_buffer_free(buf); - return op_supported; } @@ -3488,7 +3445,8 @@ static bool llama_kv_cache_init( // create a context for each buffer type std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - if (ctx_map.count(buft) == 0) { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { struct ggml_init_params params = { /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, @@ -3499,9 +3457,10 @@ static bool llama_kv_cache_init( return nullptr; } ctx_map[buft] = ctx; - cache.ctxs.push_back(ctx); + cache.ctxs.emplace_back(ctx); + return ctx; } - return ctx_map.at(buft); + return it->second; }; cache.k_l.reserve(n_layer); @@ -3553,7 +3512,7 @@ static bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - 
cache.bufs.push_back(buf); + cache.bufs.emplace_back(buf); } return true; @@ -3806,7 +3765,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.used = 0; for (auto & buf : cache.bufs) { - ggml_backend_buffer_clear(buf, 0); + ggml_backend_buffer_clear(buf.get(), 0); } } @@ -4289,21 +4248,38 @@ struct llama_model_loader { ggml_tensor * tensor; - llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { - const int tensor_idx = gguf_find_tensor(gguf_ctx, name); - offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); + if (tensor_idx < 0) { + throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); + } + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) { - throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name)); + throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); } } }; - std::vector weights; + // custom comparator to sort weights more nicely by layer + struct weight_name_comparer { + bool operator()(const std::string & a, const std::string & b) const { + int a_layer = -1; + int b_layer = -1; + sscanf(a.c_str(), "blk.%d.", &a_layer); + sscanf(b.c_str(), "blk.%d.", &b_layer); + if (a_layer != b_layer) { + return a_layer < b_layer; + } + return a < b; + } + }; + + std::map weights_map; std::unordered_map kv_overrides; - struct gguf_context * meta = NULL; - std::vector contexts; + gguf_context_ptr meta; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); @@ -4326,7 +4302,7 @@ struct llama_model_loader { /*.ctx = */ &ctx, }; - meta = gguf_init_from_file(fname.c_str(), params); + meta.reset(gguf_init_from_file(fname.c_str(), params)); if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } @@ -4341,7 +4317,14 @@ struct llama_model_loader { // For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. 
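The loader now indexes weights in a map keyed by tensor name with the weight_name_comparer shown above, so iteration visits tensors ordered by their "blk.N." layer index rather than by file order. A small self-contained demonstration of that ordering (not part of the patch; the value type is reduced to int here, in the loader it is llama_tensor_weight):

#include <cstdio>
#include <map>
#include <string>

struct weight_name_comparer {
    bool operator()(const std::string & a, const std::string & b) const {
        int a_layer = -1;
        int b_layer = -1;
        sscanf(a.c_str(), "blk.%d.", &a_layer);
        sscanf(b.c_str(), "blk.%d.", &b_layer);
        if (a_layer != b_layer) {
            return a_layer < b_layer;   // order by layer index first
        }
        return a < b;                   // then lexicographically within a layer
    }
};

int main() {
    std::map<std::string, int, weight_name_comparer> weights_map;
    weights_map["blk.10.attn_q.weight"] = 0;
    weights_map["blk.2.attn_q.weight"]  = 0;
    weights_map["token_embd.weight"]    = 0;   // no "blk.N." prefix -> layer stays -1, sorts first

    for (const auto & it : weights_map) {
        printf("%s\n", it.first.c_str());      // token_embd..., blk.2..., blk.10...
    }
    return 0;
}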
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - weights.emplace_back(files.back().get(), 0, cur->name, meta, cur); + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); } uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); @@ -4371,7 +4354,7 @@ struct llama_model_loader { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path, split_params) }; if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); } @@ -4381,17 +4364,22 @@ struct llama_model_loader { // Save tensors data offset info of the shard. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur); + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); } - - gguf_free(ctx_gguf); } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); // sanity check { - const int n_tensors_loaded = (int) weights.size(); + const int n_tensors_loaded = (int) weights_map.size(); if (n_tensors != n_tensors_loaded) { throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); } @@ -4400,23 +4388,10 @@ struct llama_model_loader { LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } - n_kv = gguf_get_n_kv(meta); - n_tensors = weights.size(); + n_kv = gguf_get_n_kv(meta.get()); + n_tensors = weights_map.size(); - fver = (enum llama_fver) gguf_get_version(meta); - - std::set tensor_names; - for (auto & w : weights) { - n_elements += ggml_nelements(w.tensor); - n_bytes += ggml_nbytes(w.tensor); - // make sure there is no duplicated tensor names - const std::string name(w.tensor->name); - auto found = tensor_names.find(name); - if (found != tensor_names.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name)); - } - tensor_names.insert(name); - } + fver = (enum llama_fver) gguf_get_version(meta.get()); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); @@ -4430,8 +4405,10 @@ struct llama_model_loader { uint32_t n_type_max = 0; enum ggml_type type_max = GGML_TYPE_F32; - for (int i = 0; i < n_tensors; i++) { - const ggml_tensor * tensor = weights.at(i).tensor; + for (const auto & it : weights_map) { + const llama_tensor_weight & w = it.second; + const ggml_tensor * tensor = w.tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -4442,8 +4419,8 @@ struct llama_model_loader 
{ } if (trace > 0) { - const uint16_t sid = weights.at(i).idx; - LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + const uint16_t sid = w.idx; + LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } @@ -4486,23 +4463,23 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV + const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(meta, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(meta, i); - const enum gguf_type type = gguf_get_kv_type(meta, i); + const char * name = gguf_get_key(meta.get(), i); + const enum gguf_type type = gguf_get_kv_type(meta.get(), i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(meta, i); + std::string value = gguf_kv_to_str(meta.get(), i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -4531,19 +4508,10 @@ struct llama_model_loader { this->check_tensors = check_tensors; } - ~llama_model_loader() { - if (meta) { - gguf_free(meta); - } - for (auto * ctx : contexts) { - ggml_free(ctx); - } - } - template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); if (kid < 0) { if (required) { @@ -4553,7 +4521,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); result = arr_info.length; @@ -4568,9 +4536,9 @@ struct llama_model_loader { template bool get_arr(const std::string & key, std::vector & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -4578,7 +4546,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; @@ -4597,9 +4565,9 @@ struct llama_model_loader { template bool get_arr(const std::string & key, std::array & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || 
gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -4607,7 +4575,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; @@ -4639,7 +4607,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(meta, key, result, override); + const bool found = GGUFMeta::GKV::set(meta.get(), key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -4656,7 +4624,7 @@ struct llama_model_loader { // get array of n <= N_MAX elements, or a single element repeated n times template bool get_key_or_arr(const std::string & key, std::array & result, uint32_t n, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); if (kid < 0) { if (required) { @@ -4669,9 +4637,9 @@ struct llama_model_loader { throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); } - if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) { + if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); if (n != arr_info.length) { throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); @@ -4707,21 +4675,13 @@ struct llama_model_loader { return llm_kv.arch; } - const char * get_tensor_name(int i) const { - return weights.at(i).tensor->name; - } - const llama_tensor_weight * get_weight(const char * name) const { - for (const auto & weight : weights) { - if (strcmp(name, weight.tensor->name) == 0) { - return &weight; - } + auto pos = weights_map.find(name); + if (pos != weights_map.end()) { + return &pos->second; } - return nullptr; - } - const llama_tensor_weight * get_weight(int i) const { - return get_weight(get_tensor_name(i)); + return nullptr; } const llama_tensor_weight & require_weight(const char * name) const { @@ -4748,10 +4708,6 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * get_tensor_meta(int i) const { - return get_tensor_meta(get_tensor_name(i)); - } - const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); @@ -4858,8 +4814,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (auto & w : weights) { - size_data += ggml_nbytes(w.tensor); + for (const auto & it : weights_map) { + size_data += ggml_nbytes(it.second.tensor); } } @@ -4871,19 +4827,12 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - try { - const auto * weight = get_weight(ggml_get_name(tensor)); - if (!weight) { - continue; - } - if (weight->idx != idx) { - continue; - } - *first = std::min(*first, weight->offs); - *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); - } catch(...) 
{ - // the tensor is not in the model + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); } } @@ -5060,7 +5009,6 @@ struct llama_model_loader { ggml_backend_tensor_set(cur, data, 0, n_size); } } else { - GGML_ASSERT(weight->idx < files.size()); const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); @@ -5371,7 +5319,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.meta; + const gguf_context * ctx = ml.meta.get(); // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -6138,7 +6086,7 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.meta; + struct gguf_context * ctx = ml.meta.get(); const auto kv = LLM_KV(model.arch); @@ -7143,10 +7091,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); - if (!ctx) { + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { throw std::runtime_error(format("failed to create ggml context")); } + ggml_context * ctx = ctx_ptr.get(); ggml_tensor * op_tensor = nullptr; @@ -7158,7 +7107,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_MUL_MAT: { - ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); } break; case GGML_OP_MUL_MAT_ID: @@ -7198,18 +7147,38 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_SSM_CONV: { - // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - op_tensor = ggml_ssm_conv(ctx, nullptr, w); + // FIXME + ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789); + op_tensor = ggml_ssm_conv(ctx, conv_x, w); } break; case GGML_OP_SSM_SCAN: { - // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr); + // FIXME + const int64_t d_state = w->ne[0]; + const int64_t d_inner = w->ne[1]; + const int64_t n_seq_tokens = 512; + const int64_t n_seqs = 1; + ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs); + ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); + ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); + op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C); } break; case GGML_OP_RWKV_WKV: { - // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr); + // FIXME + const int64_t S = 123; + const int64_t H = 123; + const int64_t n_tokens = 123; + const int64_t n_seqs = 123; + ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens); + ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); 
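// NOTE (illustrative comment, not in the original patch): like the SSM_CONV and
// SSM_SCAN cases above, the tensors built in this GGML_OP_RWKV_WKV branch are
// metadata-only dummies inside a no_alloc context - S, H, n_tokens and n_seqs are
// placeholder sizes (hence the FIXME) whose only purpose is to give ggml_rwkv_wkv()
// plausibly-shaped operands so the backend can be asked whether it supports the
// resulting node; nothing here is ever allocated or computed.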
+ ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * tf = w; + ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); + op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state); } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); @@ -7222,8 +7191,6 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_backend_buffer_free(w->buffer); w->buffer = nullptr; - ggml_free(ctx); - return op_supported; } @@ -7421,7 +7388,8 @@ static bool llm_load_tensors( std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - if (ctx_map.count(buft) == 0) { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -7432,9 +7400,10 @@ static bool llm_load_tensors( throw std::runtime_error(format("failed to create ggml context")); } ctx_map[buft] = ctx; - model.ctxs.push_back(ctx); + model.ctxs.emplace_back(ctx); + return ctx; } - return ctx_map.at(buft); + return it->second; }; // create tensors for the weights @@ -7472,7 +7441,7 @@ static bool llm_load_tensors( if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) { return nullptr; } - throw std::runtime_error(format("missing tensor %s", tn.str().c_str())); + throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); } // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops @@ -7491,7 +7460,7 @@ static bool llm_load_tensors( // tensors with "bias" suffix are always used with GGML_OP_ADD ggml_op op; - bool bias = strcmp(tn.suffix, "bias") == 0; + bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0; if (bias) { op = GGML_OP_ADD; } else { @@ -9160,7 +9129,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } - model.bufs.push_back(buf); + model.bufs.emplace_back(buf); bufs.emplace(idx, buf); } } @@ -9169,7 +9138,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } - model.bufs.push_back(buf); + model.bufs.emplace_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); @@ -9209,13 +9178,13 @@ static bool llm_load_tensors( } // print memory requirements per buffer type - for (ggml_backend_buffer_t buf : model.bufs) { - LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + for (auto & buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); } // populate tensors_by_name - for (ggml_context * ctx : model.ctxs) { - for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + for (auto & ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) { model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } } @@ -10320,10 +10289,8 @@ struct llm_build_context { } void free() { - 
if (ctx0) { - ggml_free(ctx0); - ctx0 = nullptr; - } + ggml_free(ctx0); + ctx0 = nullptr; } struct ggml_cgraph * build_k_shift() { @@ -10351,10 +10318,10 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); - for (auto * backend : lctx.backends) { + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend); + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); break; } } @@ -16487,7 +16454,7 @@ static struct ggml_cgraph * llama_build_graph( if (!lctx.cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu); + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); } } @@ -16497,10 +16464,10 @@ static struct ggml_cgraph * llama_build_graph( if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = lctx.model.dev_layer.at(il); - for (auto * backend : lctx.backends) { - if (ggml_backend_get_device(backend) == dev_layer.dev) { - if (ggml_backend_supports_op(backend, cur)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + for (auto & backend : lctx.backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer.dev) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get()); } } } @@ -17187,7 +17154,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { lctx.output_ids.resize(n_batch); } - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0; + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; const size_t new_size = (logits_size + embd_size) * sizeof(float); // alloc only when more than the current capacity is required @@ -17198,7 +17165,6 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif - ggml_backend_buffer_free(lctx.buf_output); lctx.buf_output = nullptr; lctx.logits = nullptr; lctx.embd = nullptr; @@ -17206,27 +17172,19 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { auto * buft = ggml_backend_cpu_buffer_type(); // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - ggml_tensor * output_tensor = lctx.model.output; - if (!output_tensor) { - // bert models don't have an output tensor, use the last layer - output_tensor = lctx.model.layers.back().layer_out_norm; + auto * output_dev = lctx.model.dev_output.dev; + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; } - if (output_tensor) { - auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer); - auto * output_dev = ggml_backend_buft_get_device(output_buft); - auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev); - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - } - lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size); + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); if (lctx.buf_output == nullptr) { LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); return 0; } } - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output); + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); lctx.logits = has_logits ? output_base : nullptr; lctx.embd = has_embd ? output_base + logits_size : nullptr; @@ -17238,7 +17196,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // set all ids as invalid (negative) std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - ggml_backend_buffer_clear(lctx.buf_output, 0); + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); lctx.n_outputs = 0; @@ -17298,7 +17256,7 @@ static void llama_graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf); + auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); if (err != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err); } @@ -17456,8 +17414,8 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_reset(lctx.sched.get()); + ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); @@ -17485,7 +17443,7 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_inputs(lctx, ubatch); @@ -17508,7 +17466,7 @@ static int llama_decode_internal( // extract logits if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(lctx.logits != nullptr); @@ -17524,7 +17482,7 @@ static int llama_decode_internal( // extract embeddings if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { @@ -17619,7 +17577,7 @@ static int llama_decode_internal( // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
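In llama_output_reserve above, the host buffer type is now taken from the model's output device (dev_output.dev) instead of being derived from an output tensor, and buf_output is an RAII ggml_backend_buffer_ptr, so the old explicit ggml_backend_buffer_free call disappears. Condensed into a sketch (not part of the patch; example_alloc_output is a hypothetical helper name and the includes assume the usual ggml header layout):

#include "ggml-backend.h"
#include "ggml-cpp.h"

static bool example_alloc_output(ggml_backend_buffer_ptr & buf_output,
                                 ggml_backend_dev_t        output_dev,
                                 size_t                    new_size) {
    // default to a plain CPU buffer for logits/embeddings
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // prefer the pinned host buffer type of the device that produces the output,
    // for faster device -> system memory transfers
    if (output_dev) {
        ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(output_dev);
        if (host_buft) {
            buft = host_buft;
        }
    }

    // reset() releases any previously held buffer before adopting the new one
    buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
    return buf_output != nullptr;
}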
- ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); return 0; } @@ -17697,8 +17655,8 @@ static int llama_encode_internal( GGML_ASSERT(n_threads > 0); - ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_reset(lctx.sched.get()); + ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); @@ -17722,7 +17680,7 @@ static int llama_encode_internal( } } - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_inputs(lctx, ubatch); @@ -17730,7 +17688,7 @@ static int llama_encode_internal( // extract embeddings if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); GGML_ASSERT(backend_embd != nullptr); if (llama_model_has_decoder(&lctx.model)) { @@ -17797,7 +17755,7 @@ static int llama_encode_internal( // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); return 0; } @@ -18011,7 +17969,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { #else // ggml_graph defrag - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); @@ -18033,11 +17991,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } { - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_k_shift(lctx); @@ -18077,8 +18035,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched); - if (!ggml_backend_sched_reserve(lctx.sched, gf)) { + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } } @@ -18629,40 +18587,57 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } const size_t align = GGUF_DEFAULT_ALIGNMENT; - struct gguf_context * ctx_out = gguf_init_empty(); + gguf_context_ptr ctx_out { gguf_init_empty() }; // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.meta); - gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV - gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV + gguf_set_kv (ctx_out.get(), ml.meta.get()); + gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV + gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV // Remove split metadata - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); + gguf_remove_key(ctx_out.get(), 
ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); + gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); + gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); if (params->kv_overrides) { const std::vector & overrides = *(const std::vector *)params->kv_overrides; - for (auto & o : overrides) { + for (const auto & o : overrides) { if (o.key[0] == 0) break; if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { - gguf_set_val_f32(ctx_out, o.key, o.val_f64); + gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { - gguf_set_val_i32(ctx_out, o.key, o.val_i64); + gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { - gguf_set_val_bool(ctx_out, o.key, o.val_bool); + gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - gguf_set_val_str(ctx_out, o.key, o.val_str); + gguf_set_val_str(ctx_out.get(), o.key, o.val_str); } else { LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); } } } - for (int i = 0; i < ml.n_tensors; ++i) { - const struct ggml_tensor * meta = ml.get_tensor_meta(i); + // make a list of weights + std::vector tensors; + tensors.reserve(ml.weights_map.size()); + for (const auto & it : ml.weights_map) { + tensors.push_back(&it.second); + } - const std::string name = ggml_get_name(meta); + // keep_split requires that the weights are sorted by split index + if (params->keep_split) { + std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + if (a->idx == b->idx) { + return a->offs < b->offs; + } + return a->idx < b->idx; + }); + } + + for (const auto * it : tensors) { + const struct ggml_tensor * tensor = it->tensor; + + const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || @@ -18700,32 +18675,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> f32_conv_buf; uint16_t n_split = 1; + // Assume split index is continuous if (params->keep_split) { - for (int i = 0; i < ml.n_tensors; ++i) { - n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split); + for (const auto * it : tensors) { + n_split = std::max(uint16_t(it->idx + 1), n_split); } } - std::vector ctx_outs(n_split, NULL); - ctx_outs[0] = ctx_out; + std::vector ctx_outs(n_split); + ctx_outs[0] = std::move(ctx_out); // populate the original tensors so we get an initial meta data - for (int i = 0; i < ml.n_tensors; ++i) { - auto weight = ml.get_weight(i); - uint16_t i_split = params->keep_split ? weight->idx : 0; - struct ggml_tensor * tensor = weight->tensor; - if (ctx_outs[i_split] == NULL) { - ctx_outs[i_split] = gguf_init_empty(); + for (const auto * it : tensors) { + uint16_t i_split = params->keep_split ? 
it->idx : 0; + struct ggml_tensor * tensor = it->tensor; + if (!ctx_outs[i_split]) { + ctx_outs[i_split].reset(gguf_init_empty()); } - gguf_add_tensor(ctx_outs[i_split], tensor); + gguf_add_tensor(ctx_outs[i_split].get(), tensor); } // Set split info if needed if (n_split > 1) { for (size_t i = 0; i < ctx_outs.size(); ++i) { - gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); - gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); - gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); + gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); + gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); + gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); } } @@ -18735,8 +18710,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // Write metadata and close file handler if (fout.is_open()) { fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_outs[cur_split])); - gguf_get_meta_data(ctx_outs[cur_split], data.data()); + std::vector data(gguf_get_meta_size(ctx_outs[cur_split].get())); + gguf_get_meta_data(ctx_outs[cur_split].get(), data.data()); fout.write((const char *) data.data(), data.size()); fout.close(); } @@ -18753,19 +18728,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s fout = std::ofstream(fname, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors - const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]); + const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get()); // placeholder for the meta data ::zeros(fout, meta_size); }; const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (int i = 0; i < ml.n_tensors; ++i) { - auto weight = ml.get_weight(i); - struct ggml_tensor * tensor = weight->tensor; - if (weight->idx != cur_split && params->keep_split) { + for (const auto * it : tensors) { + const auto & weight = *it; + struct ggml_tensor * tensor = weight.tensor; + if (weight.idx != cur_split && params->keep_split) { close_ofstream(); - new_ofstream(weight->idx); + new_ofstream(weight.idx); } const std::string name = ggml_get_name(tensor); @@ -18938,17 +18913,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s total_size_new += new_size; // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); - gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); + gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size); // write tensor data + padding fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } close_ofstream(); - for (auto & c:ctx_outs) { - gguf_free(c); - } LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); @@ -18962,51 +18934,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); - ggml_context * ctx = nullptr; + ggml_context * ctx_init; struct gguf_init_params 
meta_gguf_params = { /* .no_alloc = */ true, - /* .ctx = */ &ctx, + /* .ctx = */ &ctx_init, }; - struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + + gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; if (!ctx_gguf) { throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); } + ggml_context_ptr ctx { ctx_init }; + // check metadata { auto get_kv_str = [&](const std::string & key) -> std::string { - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); }; auto get_kv_f32 = [&](const std::string & key) -> float { - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); if (general_type != "adapter") { - gguf_free(ctx_gguf); throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); } auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto general_arch = llm_arch_from_string(general_arch_str); if (general_arch != model->arch) { - gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); if (adapter_type != "lora") { - gguf_free(ctx_gguf); throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); } - int n_tensors = gguf_get_n_tensors(ctx_gguf); + int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); // contexts for each buffer type std::map ctx_map; @@ -19020,7 +18992,11 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c /*.no_alloc =*/ true, }; ggml_context * buft_ctx = ggml_init(params); + if (!buft_ctx) { + return nullptr; + } ctx_map[buft] = buft_ctx; + adapter.ctxs.emplace_back(buft_ctx); return buft_ctx; }; return it->second; @@ -19031,7 +19007,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto str_endswith = [](const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; }; - for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { std::string name(cur->name); if (str_endswith(name, ".lora_a")) { replace_all(name, ".lora_a", ""); @@ -19048,8 +19024,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } @@ -19060,28 +19034,20 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c llama_lora_weight & w = it.second; if (!w.a || !w.b) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); } // device buft and device ctx auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if 
@@ -19060,28 +19034,20 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         llama_lora_weight & w = it.second;
         if (!w.a || !w.b) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
         // device buft and device ctx
         auto * model_tensor = llama_get_model_tensor(model, name.c_str());
         if (!model_tensor) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
-            gguf_free(ctx_gguf);
-            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
@@ -19096,18 +19062,15 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     {
         adapter.ctxs.reserve(ctx_map.size());
         adapter.bufs.reserve(ctx_map.size());
-        for (auto it : ctx_map) {
+        for (auto & it : ctx_map) {
             ggml_backend_buffer_type_t buft = it.first;
             ggml_context * ctx_dev = it.second;
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
             if (!buf) {
-                gguf_free(ctx_gguf);
-                ggml_free(ctx);
                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
             }
-            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-            adapter.ctxs.push_back(ctx_dev);
-            adapter.bufs.push_back(buf);
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
         }
     }
@@ -19116,7 +19079,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
-            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
@@ -19132,10 +19095,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     }
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
-
-    // free ctx for reading gguf
-    gguf_free(ctx_gguf);
-    ggml_free(ctx);
 }
 int32_t llama_lora_adapter_set(
@@ -19588,7 +19547,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ctx->backends.emplace_back(backend);
         }
         // add ACCEL backends (such as BLAS)
@@ -19601,7 +19560,7 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ctx->backends.emplace_back(backend);
            }
        }
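// Sketch (not part of the diff): the allocation loop above now wraps the freshly allocated
// backend buffer in an owning ggml_backend_buffer_ptr and moves it into adapter.bufs, so the
// adapter's destructor releases the buffers and no error path has to free them by hand.
// Generic illustration of that ownership transfer; all names below are placeholders.

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

struct buffer { std::size_t size; };
struct buffer_deleter { void operator()(buffer * b) const { delete b; } };
using buffer_ptr = std::unique_ptr<buffer, buffer_deleter>;

// stands in for an allocation call such as ggml_backend_alloc_ctx_tensors_from_buft
static buffer * alloc_buffer(std::size_t size) {
    return new buffer { size };
}

int main() {
    std::vector<buffer_ptr> bufs;                                // like adapter.bufs
    bufs.reserve(4);

    buffer_ptr buf { alloc_buffer(1024) };
    if (!buf) {
        return 1;                                                // the `if (!buf) throw ...` check reads the same as before
    }

    // after the move `buf` is empty; the vector now owns the allocation and
    // releases it automatically when it goes out of scope
    bufs.emplace_back(std::move(buf));
    return 0;
}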
@@ -19612,16 +19571,16 @@ struct llama_context * llama_new_context_with_model(
             llama_free(ctx);
             return nullptr;
         }
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.emplace_back(ctx->backend_cpu);
         // create a list of the set_n_threads functions in the backends
-        for (auto * backend : ctx->backends) {
-            ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
             ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
             if (reg) {
                 auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
                 if (ggml_backend_set_n_threads_fn) {
-                    ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+                    ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
                 }
             }
         }
@@ -19660,17 +19619,18 @@ struct llama_context * llama_new_context_with_model(
             }
             LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
-                    ggml_backend_buffer_name(ctx->buf_output),
-                    ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
+                    ggml_backend_buffer_name(ctx->buf_output.get()),
+                    ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
         }
         // scheduler and compute buffers
         {
             // buffer types used for the compute buffer of each backend
             std::vector<ggml_backend_buffer_type_t> backend_buft;
-            for (auto * backend : ctx->backends) {
-                auto * buft = ggml_backend_get_default_buffer_type(backend);
-                if (ggml_backend_is_cpu(backend) && !model->devices.empty()) {
+            std::vector<ggml_backend_t> backend_ptrs;
+            for (auto & backend : ctx->backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
                     // use the host buffer of the first device CPU for faster transfer of the intermediate state
                     auto * dev = model->devices[0];
                     auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19679,6 +19639,7 @@ struct llama_context * llama_new_context_with_model(
                     }
                 }
                 backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
             }
             const size_t max_nodes = llama_model_max_nodes(*model);
@@ -19696,12 +19657,12 @@ struct llama_context * llama_new_context_with_model(
             // pipeline parallelism requires support for async compute and events in all devices
             if (pipeline_parallel) {
-                for (auto * backend : ctx->backends) {
-                    if (ggml_backend_is_cpu(backend)) {
+                for (auto & backend : ctx->backends) {
+                    if (ggml_backend_is_cpu(backend.get())) {
                         // ignore CPU backend
                         continue;
                     }
-                    auto * dev = ggml_backend_get_device(backend);
+                    auto * dev = ggml_backend_get_device(backend.get());
                     ggml_backend_dev_props props;
                     ggml_backend_dev_get_props(dev, &props);
                     if (!props.caps.async || !props.caps.events) {
@@ -19712,10 +19673,10 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
+            ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
             if (pipeline_parallel) {
-                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
             }
             // initialize scheduler with the worst-case graph
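// Sketch (not part of the diff): ggml_backend_sched_new still takes a plain array of
// ggml_backend_t, so the code above keeps the owning pointers in ctx->backends and builds a
// parallel, non-owning vector (backend_ptrs) just for that call. A generic version of this
// "owning container + raw-pointer view for a C-style API" pattern; names are placeholders.

#include <cstddef>
#include <memory>
#include <vector>

struct backend { int id; };
using backend_handle = backend *;                                // what the C-style API expects

// stands in for a call like ggml_backend_sched_new(backends, bufts, n_backends, ...)
static std::size_t sched_new(backend_handle * backends, std::size_t n_backends) {
    (void) backends;
    return n_backends;                                           // pretend a scheduler was built
}

int main() {
    std::vector<std::unique_ptr<backend>> owned;                 // owns the lifetime (like ctx->backends)
    owned.emplace_back(new backend{0});
    owned.emplace_back(new backend{1});

    std::vector<backend_handle> raw;                             // non-owning view (like backend_ptrs)
    raw.reserve(owned.size());
    for (auto & b : owned) {
        raw.push_back(b.get());
    }

    // valid as long as `owned` outlives whatever captures the raw pointers
    return sched_new(raw.data(), raw.size()) == raw.size() ? 0 : 1;
}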
@@ -19727,29 +19688,29 @@ struct llama_context * llama_new_context_with_model(
             ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
             // reserve pp graph first so that buffers are only allocated once
-            ggml_backend_sched_reserve(ctx->sched, gf_pp);
-            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
             int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
             // reserve with tg graph to get the number of splits and nodes
             llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
             ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
-            ggml_backend_sched_reserve(ctx->sched, gf_tg);
-            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
             int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
             // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-            gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
-            if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
+            gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
+            if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                 llama_free(ctx);
                 return nullptr;
             }
-            for (size_t i = 0; i < ctx->backends.size(); i++) {
-                ggml_backend_t backend = ctx->backends[i];
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
                 ggml_backend_buffer_type_t buft = backend_buft[i];
-                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
+                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
                 if (size > 1) {
                     LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                             ggml_backend_buft_name(buft),
@@ -20029,7 +19990,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
-        if (ctx_map.count(buft) == 0) {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
             struct ggml_init_params params = {
                 /*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
@@ -20040,12 +20002,12 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
                 return nullptr;
             }
             ctx_map[buft] = ctx;
-            cvec.ctxs.push_back(ctx);
+            cvec.ctxs.emplace_back(ctx);
+            return ctx;
         }
-        return ctx_map.at(buft);
+        return it->second;
     };
-
     // make tensors
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
@@ -20076,7 +20038,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.push_back(buf);
+        cvec.bufs.emplace_back(buf);
     }
     return true;
@@ -21354,7 +21316,7 @@ int32_t llama_decode(
 }
 void llama_synchronize(struct llama_context * ctx) {
-    ggml_backend_sched_synchronize(ctx->sched);
+    ggml_backend_sched_synchronize(ctx->sched.get());
     // FIXME: if multiple single tokens are evaluated without a synchronization,
     // the stats will be added to the prompt evaluation stats