diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index f234d9284..4453d86e6 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -144,6 +144,13 @@ value binary_expression::execute_impl(context & ctx) { return false; }; + auto test_is_in = [&]() -> bool { + func_args args(ctx); + args.push_back(left_val); + args.push_back(right_val); + return global_builtins().at("test_is_in")(args)->as_bool(); + }; + // Handle undefined and null values if (is_val(left_val) || is_val(right_val)) { if (is_val(right_val) && (op.value == "in" || op.value == "not in")) { @@ -223,19 +230,11 @@ value binary_expression::execute_impl(context & ctx) { return result; } } else if (is_val(right_val)) { - auto & arr = right_val->as_array(); - bool member = false; - for (const auto & item : arr) { - if (*left_val == *item) { - member = true; - break; - } - } + // case: 1 in [0, 1, 2] + bool member = test_is_in(); if (op.value == "in") { - JJ_DEBUG("Checking membership: %s in Array is %d", left_val->type().c_str(), member); return mk_val(member); } else if (op.value == "not in") { - JJ_DEBUG("Checking non-membership: %s not in Array is %d", left_val->type().c_str(), !member); return mk_val(!member); } } @@ -252,22 +251,23 @@ value binary_expression::execute_impl(context & ctx) { // String membership if (is_val(left_val) && is_val(right_val)) { - auto left_str = left_val->as_string().str(); - auto right_str = right_val->as_string().str(); + // case: "a" in "abc" + bool member = test_is_in(); if (op.value == "in") { - return mk_val(right_str.find(left_str) != std::string::npos); + return mk_val(member); } else if (op.value == "not in") { - return mk_val(right_str.find(left_str) == std::string::npos); + return mk_val(!member); } } // Value key in object if (is_val(right_val)) { - bool has_key = right_val->has_key(left_val); + // case: key in {key: value} + bool member = test_is_in(); if (op.value == "in") { - return mk_val(has_key); + return mk_val(member); } else 
if (op.value == "not in") { - return mk_val(!has_key); + return mk_val(!member); } } diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index f254ae925..2aa156b17 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -393,6 +393,33 @@ const func_builtins & global_builtins() { {"test_is_lt", test_compare_fn}, {"test_is_lessthan", test_compare_fn}, {"test_is_ne", test_compare_fn}, + {"test_is_in", [](const func_args & args) -> value { + args.ensure_count(2); + auto needle = args.get_pos(0); + auto haystack = args.get_pos(1); + if (is_val(haystack)) { + return mk_val(false); + } + if (is_val(haystack)) { + for (const auto & item : haystack->as_array()) { + if (*needle == *item) { + return mk_val(true); + } + } + return mk_val(false); + } + if (is_val(haystack)) { + if (!is_val(needle)) { + throw raised_exception("'in' test expects args[1] as string when args[0] is string, got args[1] as " + needle->type()); + } + return mk_val( + haystack->as_string().str().find(needle->as_string().str()) != std::string::npos); + } + if (is_val(haystack)) { + return mk_val(haystack->has_key(needle)); + } + throw raised_exception("'in' test expects iterable as first argument, got " + haystack->type()); + }}, {"test_is_test", [](const func_args & args) -> value { args.ensure_vals(); auto & builtins = global_builtins(); diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 84fd76136..cab231bad 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -7,6 +7,18 @@ #include #include +// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32. +#define LCG_FACTOR 2654435761UL + +// Compute the LCG hash of a n-gram of size len at offset start. 
+static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) { + uint32_t hash = 0; + for (size_t i = 0; i < len; ++i) { + hash = hash * LCG_FACTOR + tokens[start + i]; + } + return hash; +} + // Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { std::ostringstream oss; @@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft( // maximum number of counted values of a ngram map value. #define COMMON_NGRAM_MAX_VALUE_COUNT 16380 +void common_ngram_map_begin( + common_ngram_map & map, const llama_tokens & tokens) { + size_t size_begin = tokens.size(); + + LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__, + map.idx_last_check, size_begin, map.keys.size()); + + size_t count_map_entries_upd = 0; + if (!map.key_map.empty() && size_begin < map.idx_last_check) { + if (map.show_key_map_stats) { + // Print statistics of hash map map_key. + size_t count_nonzero = 0; + uint32_t min_idx = UINT32_MAX; + uint32_t max_idx = 0; + for (size_t i = 0; i < map.key_map.size(); ++i) { + uint32_t key_idx = map.key_map[i]; + if (key_idx != 0) { + ++count_nonzero; + if (key_idx < min_idx) min_idx = key_idx; + if (key_idx > max_idx) max_idx = key_idx; + } + } + if (count_nonzero == 0) { + min_idx = 0; + } + LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n", + __func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx); + } + + // Update the map from hash to key index (clear outdated entries). + for (size_t i = 0; i < map.key_map.size(); ++i) { + uint32_t key_idx = map.key_map[i]; + if (key_idx >= map.size_last_begin) { + map.key_map[i] = 0; + count_map_entries_upd++; + } + } + map.key_map_last_idx = (map.size_last_begin > 0) ? 
map.size_last_begin - 1 : 0; + } + + if (size_begin < map.idx_last_check && !map.keys.empty()) { + // The next token generation will start at index size_begin. + // The tokens between map.size_last_begin and size_begin are no longer valid. + // + // Refresh map: Remove all entries with index >= map.size_last_begin. + size_t count_keys = map.keys.size(); + size_t count_keys_del = 0; + size_t count_values_del = 0; + for (int32_t i = map.keys.size() - 1; i >= 0; --i) { + common_ngram_map_key & key = map.keys[i]; + if (key.key_idx >= map.size_last_begin) { + // Delete the key. + LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin); + map.keys.erase(map.keys.begin() + i); + count_keys_del++; + continue; + } + if (map.key_only) { + continue; + } + + // Check the indices of the values. + for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) { + common_ngram_map_value & value = key.values[j]; + if (value.value_idx >= map.size_last_begin) { + // Delete the value. + count_values_del++; + + // Move all values after this value to the left. + for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) { + key.values[k] = key.values[k + 1]; + } + // Clear the last value. + key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0; + key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0; + } + } + if (key.values[0].value_idx == 0) { + // No values left, delete the key. + LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx); + map.keys.erase(map.keys.begin() + i); + count_keys_del++; + } + } + + LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__, + map.idx_last_check, size_begin, + count_keys, count_keys_del, count_values_del, count_map_entries_upd); + } + + map.idx_last_check = (map.size_last_begin > 0) ? 
map.size_last_begin - 1 : 0; + map.size_last_begin = size_begin; +} + void common_ngram_map_draft(common_ngram_map & map, const llama_tokens & inp, llama_token sampled, llama_tokens & draft) { @@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map, if (cur_len < static_cast(2 * n + m)) { return; } + if (cur_len >= static_cast(UINT32_MAX)) { + // key_map uses uint32_t instead of size_t. + GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len); + } // Only check every check_rate tokens to save compute // i.e., perform check if (cur_len - idx_last_check) >= check_rate @@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map, // search for the key in the map size_t match_pos = 0; - for (size_t j = cur_len - n - m - 1; j > 0; --j) { - bool match = true; - for (size_t k = 0; k < n; ++k) { - if (inp[j + k] != key_tokens[k]) { - match = false; - break; + if (map.size_last_begin > cur_len) { + GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len); + } + if (!map.key_map.empty()) { + // Search for the key in the map key_map from hash of ngrams to index of ngram. + uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size()); + uint32_t idx_key = map.key_map[idx_hash]; + if (idx_key != 0 && idx_key < cur_len - n - m - 1) { + // Check if the key matches the key at idx_key (because of possible collisions). + bool match = true; + for (size_t k = 0; k < n; ++k) { + if (inp[idx_key + k] != key_tokens[k]) { + match = false; + break; + } + } + LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0); + if (match) { + match_pos = idx_key; } } - if (match) { - match_pos = j; - break; + } + if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) { + // Search for the key in [1, map.size_last_begin - n - m -1], descending. 
+ for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) { + // Check if the key matches the key. + bool match = true; + for (size_t k = 0; k < n; ++k) { + if (inp[j + k] != key_tokens[k]) { + match = false; + break; + } + } + if (match) { + match_pos = j; + break; + } + } + } + if (match_pos == 0) { + // In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later. + // + // Search in [size_last_begin, cur_len - n - m - 1], descending. + for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) { + bool match = true; + for (size_t k = 0; k < n; ++k) { + if (inp[j + k] != key_tokens[k]) { + match = false; + break; + } + } + if (match) { + match_pos = j; + break; + } } } if (match_pos > 0) { - LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__, + LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__, cur_len, n, m, key_tokens.size(), sampled, match_pos); } + if (!map.key_map.empty()) { + // Add hashes of new ngrams in key_map. + // + // Use the same order as above. + if (map.size_last_begin > (size_t) (n + m + 1)) { + for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) { + // compute hash and store index of ngram at idx j in the map. + uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size()); + if (map.key_map[idx_hash] == 0) { + map.key_map[idx_hash] = j; // collisions may occur + } + } + } + + for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) { + // compute hash and store index of ngram at idx j in the map. 
+ uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size()); + if (map.key_map[idx_hash] == 0) { + map.key_map[idx_hash] = j; + } + } + map.key_map_last_idx = std::max(static_cast(cur_len - n - m - 1), map.key_map_last_idx); + } + if (match_pos == 0) { return; } @@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__, - key_offset, curr_key.key_num, draft.size()); + LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__, + curr_key.key_idx, key_offset, curr_key.key_num, draft.size()); map.last_draft_created = false; map.last_draft_key_idx = key_offset; @@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map, } } - if (sum_occur > 0 && max_occur < 3 * sum_occur) { + if (sum_occur > 0 && max_occur < 2 * sum_occur) { // The most frequent value is not much more frequent than the other values. // We do not use the draft. return; diff --git a/common/ngram-map.h b/common/ngram-map.h index b365034ac..c094d513d 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -9,6 +9,8 @@ // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map. // The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams. // +// ref: https://github.com/ggml-org/llama.cpp/pull/18471 +// #include "llama.h" #include "common.h" @@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft( // maximum number of m-gram values stored for each key n-gram. #define COMMON_NGRAM_MAX_VALUES 4 +// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index. 
+#define COMMON_NGRAM_HASH_MAP_SIZE 262144 + // statistics of a m-gram after a known n-gram struct common_ngram_map_value { - size_t value_idx = 0; // index of value m-gram in token-history (0 if unused) - uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot) + size_t value_idx = 0; // index of value m-gram in token-history (0 if unused) + uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot) int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused) }; @@ -74,23 +79,43 @@ struct common_ngram_map { bool key_only; // true if only key n-grams are used, no values. - // first draft: vector only, no map. std::vector keys; // key n-grams which occur several times in token-history uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft + bool show_key_map_stats = false; // true, if statitics of the key_map should be printed. + common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, uint16_t check_rate, uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), - check_rate(check_rate), min_hits(min_hits) {} + check_rate(check_rate), min_hits(min_hits) { + key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used + } + + // In reasoning chats the previous reasoning block will be removed from context history. + // A rebuild of the ngram map is needed after that. + + size_t size_last_begin = 0; // number of tokens at previous start of generation bool last_draft_created = false; // true if a draft was created at last call. - size_t last_draft_key_idx = 0; // index of last key used for draft generation. 
+ size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft) uint16_t last_draft_value_idx = 0; // index of last value used for draft generation. size_t idx_last_check = 0; // index of last check in context history + + // optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused. + // + // uint32_t instead of size_t (size of current histories is << UINT32_MAX) + std::vector key_map; // key_map[hash] = index of ngram in context window + uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map }; +// Initialize the n-gram map with the given token history. +// map: the ngram map to initialize. +// tokens: the token history to base the map on. +void common_ngram_map_begin( + common_ngram_map & map, + const llama_tokens & tokens); // Searches for the n-gram in the history and checks whether a draft sequence should be generated. // map: the ngram map to search in. diff --git a/common/speculative.cpp b/common/speculative.cpp index a1a3b51c1..80cd31e35 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -124,9 +124,9 @@ struct common_speculative_state { // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. - // TODO: rename to t_draft_us - // TODO: add t_begin_us, t_accept_us - int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds. + int64_t t_begin_us = 0; // total time spent in refresh of this implementation in microseconds. + int64_t t_draft_us = 0; // total time spent in generating drafts in this implementation in microseconds. + int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds. 
common_speculative_state(enum common_speculative_type type) : type(type) {} @@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state { : common_speculative_state(type), map(std::move(map)) {} void begin(const llama_tokens & prompt) override { - GGML_UNUSED(prompt); + common_ngram_map_begin(map, prompt); } void draft( @@ -951,6 +951,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr } for (auto & impl : spec->impls) { + common_time_meas tm(impl->t_begin_us, !impl->gen_perf); impl->begin(prompt); } } @@ -966,14 +967,9 @@ llama_tokens common_speculative_draft( for (auto & impl : spec->impls) { { - const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0; - + common_time_meas tm(impl->t_draft_us, !impl->gen_perf); impl->draft(params, prompt_tgt, id_last, result); - - const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0; - impl->drafts_call_count++; - impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation } if (!result.empty()) { @@ -1001,12 +997,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) { GGML_ASSERT(impl); - if (n_accepted > 0) { - impl->drafts_accepted_count++; - impl->drafts_accepted_tokens += n_accepted; - } + { + common_time_meas tm(impl->t_accept_us, !impl->gen_perf); + if (n_accepted > 0) { + impl->drafts_accepted_count++; + impl->drafts_accepted_tokens += n_accepted; + } - impl->accept(n_accepted); + impl->accept(n_accepted); + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1018,13 +1017,14 @@ void common_speculative_print_stats(const common_speculative * spec) { std::string str_perf; if (impl->gen_perf) { std::ostringstream oss; - oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0; - str_perf = ", dur = " + oss.str() + " ms"; + oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", "; + oss << std::fixed << 
std::setprecision(3) << impl->t_draft_us / 1000.0 << ", "; + oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0; + str_perf = ", dur(b,g,a) = " + oss.str() + " ms"; } else { str_perf = ""; } - // TODO: report time for begin() and accept() LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->drafts_call_count, diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 4f3b99c8d..e3e067c91 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -19,6 +19,9 @@ extern "C" { // abort ggml_graph_compute when true ggml_abort_callback abort_callback; void * abort_callback_data; + + // use only reference implementations + bool use_ref; }; // numa strategies @@ -132,6 +135,8 @@ extern "C" { GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + GGML_BACKEND_API void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index cadc157ea..307ff9cca 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -6,7 +6,7 @@ // This documentation is still a work in progress. 
// If you wish some specific topics to be covered, feel free to drop a comment: // -// https://github.com/ggerganov/whisper.cpp/issues/40 +// https://github.com/ggml-org/whisper.cpp/issues/40 // // ## Overview // diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 1e37454b1..cc9196899 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (backend->iface.set_tensor_async == NULL) { + ggml_backend_synchronize(backend); ggml_backend_tensor_set(tensor, data, offset, size); } else { backend->iface.set_tensor_async(backend, tensor, data, offset, size); @@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (backend->iface.get_tensor_async == NULL) { + ggml_backend_synchronize(backend); ggml_backend_tensor_get(tensor, data, offset, size); } else { backend->iface.get_tensor_async(backend, tensor, data, offset, size); diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 0e8dd0ae0..88a9c9ec0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -24,6 +24,9 @@ struct ggml_compute_params { void * wdata; struct ggml_threadpool * threadpool; + + // use reference implementation + bool use_ref; }; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 3490f434e..d11020021 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -5,7 +5,6 @@ #include "ggml-backend.h" #include "traits.h" #include "ggml-cpu-impl.h" -#include "ggml-cpu.h" #include "ggml-impl.h" #include "quants.h" #include "ggml-threading.h" @@ -3704,12 +3703,20 @@ struct ggml_cplan ggml_graph_plan( } break; case GGML_OP_FLASH_ATTN_EXT: { + const 
int64_t neq2 = node->src[0]->ne[2]; // number of query heads const int64_t DK = node->src[1]->ne[0]; const int64_t DV = node->src[2]->ne[0]; // Tiled flash attention scratch (tile sizes defined in common.h) // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding - cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks; + size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks; + + // Decode path: n_kv_chunks = n_tasks (one chunk per thread) + // Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ + size_t n_chunks = n_tasks; + size_t decode = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV)); + + cur += MAX(prefill, decode); } break; case GGML_OP_FLASH_ATTN_BACK: { @@ -3778,11 +3785,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ tp, + /*.ith =*/ state->ith, + /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool =*/ tp, + /*.use_ref =*/ cplan->use_ref, }; GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index db325f515..efb83e9be 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -105,6 +105,8 @@ struct ggml_backend_cpu_context { ggml_abort_callback abort_callback; void * abort_callback_data; + + bool use_ref; // use reference implementation }; static const char * 
ggml_backend_cpu_get_name(ggml_backend_t backend) { @@ -143,6 +145,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cpu_plan->cplan.use_ref = cpu_ctx->use_ref; return cpu_plan; } @@ -182,6 +185,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cplan.use_ref = cpu_ctx->use_ref; return ggml_graph_compute(cgraph, &cplan); } @@ -223,6 +227,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->work_size = 0; ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; + ctx->use_ref = false; ggml_backend_t cpu_backend = new ggml_backend { /* .guid = */ ggml_backend_cpu_guid(), @@ -270,6 +275,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_ ctx->abort_callback_data = abort_callback_data; } +void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->use_ref = use_ref; +} + // CPU backend - device struct ggml_backend_cpu_device_context { @@ -646,6 +658,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { return (void *)ggml_is_numa; } + if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) { + return (void *)ggml_backend_cpu_set_use_ref; + } // threadpool - TODO: move to ggml-base if (strcmp(name, "ggml_threadpool_new") == 0) { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 48c896436..ce15b18ce 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8042,12 +8042,14 @@ void ggml_compute_forward_top_k( } } -// 
ggml_compute_forward_flash_attn_ext - static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( const ggml_compute_params * params, ggml_tensor * dst, - int ir0, int ir1) { + int ir0, int ir1, + int64_t ic_start, int64_t ic_end, + float * partials, int64_t partial_stride) { + + const bool write_partials = (partials != nullptr); const ggml_tensor * q = dst->src[0]; const ggml_tensor * k = dst->src[1]; const ggml_tensor * v = dst->src[2]; @@ -8124,7 +8126,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( int ith = params->ith; - // loop over n_batch and n_head for (int ir = ir0; ir < ir1; ++ir) { // q indices const int iq3 = ir/(neq2*neq1); @@ -8165,7 +8166,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf - for (int64_t ic = 0; ic < nek1; ++ic) { + for (int64_t ic = ic_start; ic < ic_end; ++ic) { const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f; if (mv == -INFINITY) { continue; @@ -8238,8 +8239,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( } } - // sinks - if (sinks) { + // sinks - apply only on the first kv-chunk + if (sinks && ic_start == 0) { const float s = ((float *)((char *) sinks->data))[h]; float ms = 1.0f; @@ -8247,6 +8248,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( if (s > M) { ms = expf(M - s); + M = s; ggml_vec_scale_f32(DV, VKQ32, ms); } else { vs = expf(s - M); @@ -8255,20 +8257,26 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( S = S*ms + vs; } - // V /= S - const float S_inv = S == 0.0f ? 
0.0f : 1.0f/S; - ggml_vec_scale_f32(DV, VKQ32, S_inv); + if (write_partials) { + // Write M, S, VKQ to partials for later reduction + // partials layout: [M, S, VKQ[DV]] per query head + float * partial = partials + ir * partial_stride; + partial[0] = M; + partial[1] = S; + memcpy(partial + 2, VKQ32, DV * sizeof(float)); + } else { + // V /= S + const float S_inv = S == 0.0f ? 0.0f : 1.0f/S; + ggml_vec_scale_f32(DV, VKQ32, S_inv); - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; - // original - //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); - - // permute(0, 2, 1, 3) - memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); + // permute(0, 2, 1, 3) + memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); + } } } @@ -8546,6 +8554,78 @@ static void ggml_compute_forward_flash_attn_ext_tiled( } } +// Reduction function: combines partial results across KV chunks +// Partials layout in wdata: [n_q_heads][n_chunks][2 + DV] +static void ggml_flash_attn_ext_reduce_partials( + const ggml_compute_params * params, + ggml_tensor * dst, + const int64_t n_chunks, + const int64_t chunk_size) { + + const ggml_tensor * q = dst->src[0]; + const ggml_tensor * k = dst->src[1]; + const ggml_tensor * v = dst->src[2]; + + const int64_t DK = k->ne[0]; + const int64_t DV = v->ne[0]; + const int64_t nek1 = k->ne[1]; + const int64_t n_q_heads = q->ne[2]; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t wdata_per_thread = DK + 2*DV + CACHE_LINE_SIZE_F32; + float * thread_wdata = (float *) params->wdata + ith * wdata_per_thread; + + const int64_t partials_offset = nth * (DK + 2*DV + CACHE_LINE_SIZE_F32); + const int64_t partial_size = 2 + DV; + const float * partials_base = (const float *) params->wdata + partials_offset; + + // Output layout + const int64_t ne1 = dst->ne[1]; + 
const int64_t ne2 = dst->ne[2]; + const size_t nb1 = dst->nb[1]; + + // Each thread reduces a subset of query heads + for (int64_t q_head = ith; q_head < n_q_heads; q_head += nth) { + float M_final = -INFINITY; + float S_final = 0.0f; + float * VKQ_final = thread_wdata; + memset(VKQ_final, 0, DV * sizeof(float)); + + // Combine partials from all chunks + for (int64_t chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) { + const int64_t ic_start = chunk_idx * chunk_size; + if (ic_start >= nek1) continue; + + const float * partial = partials_base + (q_head * n_chunks + chunk_idx) * partial_size; + const float M_chunk = partial[0]; + const float S_chunk = partial[1]; + const float * VKQ_chunk = partial + 2; + + if (S_chunk == 0.0f) continue; + + const float M_new = fmaxf(M_final, M_chunk); + const float scale_old = expf(M_final - M_new); + const float scale_new = expf(M_chunk - M_new); + + for (int64_t d = 0; d < DV; ++d) { + VKQ_final[d] = VKQ_final[d] * scale_old + VKQ_chunk[d] * scale_new; + } + S_final = S_final * scale_old + S_chunk * scale_new; + M_final = M_new; + } + + // Normalize and write to output + if (S_final != 0.0f) { + const float S_inv = 1.0f / S_final; + ggml_vec_scale_f32(DV, VKQ_final, S_inv); + } + // iq1=0, iq3=0 for decode + memcpy((char *) dst->data + (0*ne2*ne1 + q_head + 0*ne1)*nb1, VKQ_final, nb1); + } +} + static void ggml_compute_forward_flash_attn_ext_f16( const ggml_compute_params * params, ggml_tensor * dst) { @@ -8567,6 +8647,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( const int64_t DV = nev0; const int64_t N = neq1; + GGML_ASSERT(ne0 == DV); GGML_ASSERT(ne2 == N); @@ -8587,60 +8668,92 @@ static void ggml_compute_forward_flash_attn_ext_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int64_t nr = neq1*neq2*neq3; - - // rows per thread const int ith = params->ith; const int nth = params->nth; - // disable for NUMA - const bool 
disable_chunking = ggml_is_numa(); + // When use_ref is set, force the vec-only reference implementation (no tiling, no KV-chunking) + const bool use_ref = params->use_ref; - // 4x chunks per thread - int nth_scaled = nth * 4; - int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled; - int64_t nchunk = (nr + chunk_size - 1) / chunk_size; - - if (nth == 1 || nchunk < nth || disable_chunking) { - nchunk = nth; - } - - if (ith == 0) { - // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - ggml_threadpool_chunk_set(params->threadpool, nth); - } - - ggml_barrier(params->threadpool); - - // The number of elements in each chunk - const int64_t dr = (nr + nchunk - 1) / nchunk; - - static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV; - static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q; const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16); - const bool use_tiled = (q->type == GGML_TYPE_F32 && - kv_is_f32_or_f16 && - k->type == v->type && - nek1 % KV_TILE_SZ == 0 && - neq1 >= Q_TILE_SZ); // Only use tiled for batch >= tile size + const bool use_split_kv_path = !use_ref && (neq1 == 1 && neq3 == 1) && kv_is_f32_or_f16 && (k->type == v->type) && q->type == GGML_TYPE_F32 && nek1 >= 512; - // The first chunk comes from our thread_id, the rest will get auto-assigned. 
- int current_chunk = ith; + if (use_split_kv_path) { + const int64_t chunk_size = (nek1 + nth - 1) / nth; - while (current_chunk < nchunk) { - const int64_t ir0 = dr * current_chunk; - const int64_t ir1 = MIN(ir0 + dr, nr); + // Partials buffer layout: [q_head][kv_chunk][M, S, VKQ] + const int64_t partial_size = 2 + DV; + float * partials_base = (float *) params->wdata + nth * (DK + 2*DV + CACHE_LINE_SIZE_F32); - if (use_tiled) { - ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1); + const int64_t ic_start = ith * chunk_size; + const int64_t ic_end = std::min(ic_start + chunk_size, nek1); + + const int64_t partial_stride = nth * partial_size; + float * chunk_partials = partials_base + ith * partial_size; + + if (ic_start < nek1) { + for (int64_t q_head = 0; q_head < neq2; q_head++) { + ggml_compute_forward_flash_attn_ext_f16_one_chunk( + params, dst, q_head, q_head + 1, ic_start, ic_end, + chunk_partials, partial_stride); + } } else { - ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1); + for (int64_t q_head = 0; q_head < neq2; q_head++) { + float * q_partials = chunk_partials + q_head * partial_stride; + q_partials[0] = -INFINITY; // M + q_partials[1] = 0.0f; // S + } } - current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); + ggml_barrier(params->threadpool); + ggml_flash_attn_ext_reduce_partials(params, dst, nth, chunk_size); + } else { + + // total rows in q + const int64_t nr = neq1*neq2*neq3; + + // disable for NUMA + const bool disable_chunking = ggml_is_numa(); + + // 4x chunks per thread + int nth_scaled = nth * 4; + int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled; + int64_t nchunk = (nr + chunk_size - 1) / chunk_size; + + if (nth == 1 || nchunk < nth || disable_chunking) { + nchunk = nth; + } + + if (ith == 0) { + ggml_threadpool_chunk_set(params->threadpool, nth); + } + + ggml_barrier(params->threadpool); + + const int64_t dr = (nr + nchunk - 1) / nchunk; + + static constexpr int64_t KV_TILE_SZ = 
ggml_fa_tile_config::KV; + static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q; + const bool use_tiled = !use_ref && + (q->type == GGML_TYPE_F32 && + kv_is_f32_or_f16 && + k->type == v->type && + nek1 % KV_TILE_SZ == 0 && + neq1 >= Q_TILE_SZ); + + int current_chunk = ith; + + while (current_chunk < nchunk) { + const int64_t ir0 = dr * current_chunk; + const int64_t ir1 = MIN(ir0 + dr, nr); + + if (use_tiled) { + ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1); + } else { + ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1, 0, nek1, nullptr, 0); + } + + current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); + } } } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9861be08e..44a3348e2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -5067,16 +5067,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { - // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance - // PR: https://github.com/ggml-org/llama.cpp/pull/19042 - if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) { -#ifdef _WIN32 - _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x"); -#else - setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set -#endif // _WIN32 - } - ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? 
atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32; diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h index ec2b686b7..abf4b06ed 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.h +++ b/ggml/src/ggml-metal/ggml-metal-context.h @@ -15,14 +15,22 @@ typedef struct ggml_metal * ggml_metal_t; ggml_metal_t ggml_metal_init(ggml_metal_device_t dev); void ggml_metal_free(ggml_metal_t ctx); +const char * ggml_metal_get_name(ggml_metal_t ctx); + void ggml_metal_synchronize(ggml_metal_t ctx); void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); +bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf); void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf); +void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev); +void ggml_metal_event_wait (ggml_metal_t ctx, ggml_metal_event_t ev); + +ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx); + void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb); void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data); bool ggml_metal_supports_family (ggml_metal_t ctx, int family); diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index 42a35736e..a412d70ae 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -24,9 +24,13 @@ struct ggml_metal_command_buffer { }; struct ggml_metal { + char name[128]; + ggml_metal_device_t dev; ggml_metal_library_t lib; + ggml_metal_event_t ev_cpy; // for async copies + dispatch_queue_t d_queue; // additional, 
inference-time compiled pipelines @@ -117,7 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { } } - //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); + res->ev_cpy = ggml_metal_device_event_init(dev); + + const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); + + snprintf(res->name, sizeof(res->name), "%s", props_dev->name); res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); @@ -206,9 +214,15 @@ void ggml_metal_free(ggml_metal_t ctx) { dispatch_release(ctx->d_queue); + ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy); + free(ctx); } +const char * ggml_metal_get_name(ggml_metal_t ctx) { + return ctx->name; +} + void ggml_metal_synchronize(ggml_metal_t ctx) { // wait for any backend operations to finish if (ctx->cmd_buf_last) { @@ -273,8 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, // wrap the source data into a Metal buffer id device = ggml_metal_device_get_obj(ctx->dev); id buf_src = [device newBufferWithBytes:data - length:size - options:MTLResourceStorageModeShared]; + length:size + options:MTLResourceStorageModeShared]; GGML_ASSERT(buf_src); @@ -316,9 +330,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te @autoreleasepool { id device = ggml_metal_device_get_obj(ctx->dev); id buf_dst = [device newBufferWithBytesNoCopy:data - length:size - options:MTLResourceStorageModeShared - deallocator:nil]; + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; GGML_ASSERT(buf_dst); @@ -356,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te } } +bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + @autoreleasepool { + struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src); + struct ggml_metal_buffer_id bid_dst = 
ggml_metal_get_buffer_id(dst); + + if (bid_src.metal == nil || bid_dst.metal == nil) { + return false; + } + + // queue the copy operation into the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id queue = ggml_metal_device_get_queue(ctx_src->dev); + id cmd_buf = [queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:bid_src.metal + sourceOffset:bid_src.offs + toBuffer:bid_dst.metal + destinationOffset:bid_dst.offs + size:ggml_nbytes(src)]; + + [encoder endEncoding]; + + ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src); + ggml_metal_event_record(ctx_src, ev_cpy); + + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx_src->cmd_bufs_ext addObject:cmd_buf]; + ctx_src->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + + ggml_metal_event_wait(ctx_dst, ev_cpy); + + return true; + } +} + enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) { // number of nodes encoded by the main thread (empirically determined) const int n_main = 64; @@ -530,6 +587,42 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) { //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0); } +void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) { + @autoreleasepool { + id queue = ggml_metal_device_get_queue(ctx->dev); + id cmd_buf = [queue commandBuffer]; + + ggml_metal_event_encode_signal(ev, cmd_buf); + + [cmd_buf commit]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) { + @autoreleasepool { + id queue = ggml_metal_device_get_queue(ctx->dev); + id cmd_buf = [queue commandBuffer]; + + ggml_metal_event_encode_wait(ev, cmd_buf); + + 
[cmd_buf commit]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) { + return ctx->ev_cpy; +} + void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) { if (ctx->n_cb != n_cb) { ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 04c6137c5..377b0d3eb 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -17,10 +17,12 @@ struct ggml_metal_device_deleter { typedef std::unique_ptr ggml_metal_device_ptr; -ggml_metal_device_t ggml_metal_device_get(void) { - static ggml_metal_device_ptr ctx { ggml_metal_device_init() }; +ggml_metal_device_t ggml_metal_device_get(int device) { + static std::vector devs; - return ctx.get(); + devs.emplace_back(ggml_metal_device_init(device)); + + return devs.back().get(); } struct ggml_metal_pipelines { diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 3d01c56fb..afb091e72 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -205,7 +205,9 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets); // struct ggml_metal_device_props { + int device; char name[128]; + char desc[128]; size_t max_buffer_size; size_t max_working_set_size; @@ -224,11 +226,15 @@ struct ggml_metal_device_props { int op_offload_min_batch_size; }; -ggml_metal_device_t ggml_metal_device_init(void); +typedef struct ggml_metal_event * ggml_metal_event_t; + +void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf); +void ggml_metal_event_encode_wait (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf); + +ggml_metal_device_t ggml_metal_device_init(int device); void ggml_metal_device_free(ggml_metal_device_t dev); -// return a singleton that is automatically destroyed when the 
program exits -ggml_metal_device_t ggml_metal_device_get(void); +ggml_metal_device_t ggml_metal_device_get(int device); void * ggml_metal_device_get_obj (ggml_metal_device_t dev); // id void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id @@ -240,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev); +ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev); +void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev); +void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev); + void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total); bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 878784f5d..150fd5a4e 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -24,9 +24,6 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001; static const NSInteger MTLGPUFamilyMetal4_GGML = 5002; -// virtual address for GPU memory allocations -static atomic_uintptr_t g_addr_device = 0x000000400ULL; - #if !GGML_METAL_EMBED_LIBRARY // Here to assist with NSBundle Path Hack @interface GGMLMetalClass : NSObject @@ -523,6 +520,9 @@ struct ggml_metal_device { ggml_metal_library_t library; struct ggml_metal_device_props props; + + // virtual address for GPU memory allocations + atomic_uintptr_t addr_virt; }; // @@ -624,7 +624,7 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) { free(rsets); } -ggml_metal_device_t ggml_metal_device_init(void) { +ggml_metal_device_t ggml_metal_device_init(int device) { ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device)); assert(dev != NULL); @@ -638,6 +638,9 @@ ggml_metal_device_t ggml_metal_device_init(void) { 
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); } + dev->addr_virt = 0x000000400ULL; + + dev->props.device = device; dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7]; dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -798,7 +801,8 @@ ggml_metal_device_t ggml_metal_device_init(void) { dev->props.max_working_set_size = dev->mtl_device.maxBufferLength; } - strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1); + snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device); + snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]); dev->library = ggml_metal_library_init(dev); if (!dev->library) { @@ -928,6 +932,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) { atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed); } +struct ggml_metal_event { + void * obj; // id + + atomic_int value; +}; + +void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) { + id event = (id)ev->obj; + + id cmd_buf = (id) cmd_buf_raw; + + [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1]; +} + +void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) { + id event = (id)ev->obj; + + id cmd_buf = (id) cmd_buf_raw; + + [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)]; +} + +ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) { + id event = [dev->mtl_device newEvent]; + + ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event)); + + ev->obj = (__bridge void *)event; + ev->value = 0; + + return ev; +} + +void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) { + id event = ev->obj; + [event release]; + + free(ev); 
+ + GGML_UNUSED(dev); +} + +void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) { + @autoreleasepool { + id event = ev->obj; + + id cmd_buf = [dev->mtl_queue commandBuffer]; + [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)]; + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) { if (@available(macOS 10.12, iOS 16.0, *)) { *total = dev->mtl_device.recommendedMaxWorkingSetSize; @@ -1350,8 +1407,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, res->all_data = ggml_metal_host_malloc(size_aligned); res->is_shared = true; } else { - // use virtual address from g_addr_device counter - res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed); + // use virtual address + res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed); res->is_shared = false; } res->all_size = size_aligned; diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 56b59f0af..a616dcdb4 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -7,11 +7,12 @@ #include "ggml-metal-context.h" #include "ggml-metal-ops.h" -// globals +#define GGML_METAL_NAME "MTL" +#define GGML_METAL_MAX_DEVICES 16 -// initialized in ggml_backend_metal_reg -static ggml_backend_reg g_ggml_metal_reg; -static ggml_backend_device g_ggml_metal_device; +// number of Metal devices +// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices +static int g_devices = 1; //////////////////////////////////////////////////////////////////////////////// // backend interface @@ -165,10 +166,28 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = { /* .reset = */ NULL, }; +static bool 
ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer || + buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer; +} + // // buffer types // +struct ggml_backend_metal_buffer_type { + int device; + std::string name; +}; + +struct ggml_backend_metal_buffer_type_deleter { + void operator()(ggml_backend_metal_buffer_type * ctx) const { + delete ctx; + } +}; + +typedef std::unique_ptr ggml_backend_metal_buffer_type_ptr; + // common method for allocating shread or private Metal buffers static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) { ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context; @@ -218,9 +237,9 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_ // default (shared) buffer type static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) { - return "Metal"; + ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context; - GGML_UNUSED(buft); + return ctx->name.c_str(); } static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -249,29 +268,54 @@ static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_ty GGML_UNUSED(buft); } -static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) { - static ggml_backend_buffer_type ggml_backend_buffer_type_metal = { - /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, - /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size, - 
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, - }, - /* .device = */ &g_ggml_metal_device, - /* .context = */ NULL, - }; +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(int device) { + static std::mutex mutex; + std::lock_guard lock(mutex); - return &ggml_backend_buffer_type_metal; + static std::vector bufts; + static std::vector ctxs; + + static bool initialized = false; + if (!initialized) { + bufts.reserve(g_devices); + ctxs.reserve(g_devices); + + for (int i = 0; i < g_devices; ++i) { + ggml_backend_metal_buffer_type * raw_ctx = + new ggml_backend_metal_buffer_type { + /* .device = */ i, + /* .name = */ GGML_METAL_NAME + std::to_string(i), + }; + ctxs.emplace_back(raw_ctx); + + ggml_backend_buffer_type buft = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size, + /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i), + /* .context = */ raw_ctx, + }; + + bufts.emplace_back(buft); + } + + initialized = true; + } + + return &bufts[device]; } // default (private) buffer type static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) { - return "Metal_Private"; + ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context; - GGML_UNUSED(buft); + return ctx->name.c_str(); } static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -300,29 +344,53 @@ static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_t GGML_UNUSED(buft); } -static 
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) { - static ggml_backend_buffer_type ggml_backend_buffer_type_metal = { - /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, - /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size, - /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, - }, - /* .device = */ &g_ggml_metal_device, - /* .context = */ NULL, - }; +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(int device) { + static std::mutex mutex; + std::lock_guard lock(mutex); - return &ggml_backend_buffer_type_metal; + static std::vector bufts; + static std::vector ctxs; + + static bool initialized = false; + if (!initialized) { + bufts.reserve(g_devices); + ctxs.reserve(g_devices); + + for (int i = 0; i < g_devices; ++i) { + ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{ + /* .device = */ i, + /* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Private" + }; + ctxs.emplace_back(raw_ctx); + + ggml_backend_buffer_type buft = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size, + /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i), + /* .context = */ raw_ctx, + }; + + bufts.emplace_back(buft); + } + + initialized = true; + } + + return &bufts[device]; } // 
mapped buffer type static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) { - return "Metal_Mapped"; + ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context; - GGML_UNUSED(buft); + return ctx->name.c_str(); } static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -352,31 +420,55 @@ static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_ty GGML_UNUSED(buft); } -static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) { - // note: not obvious, but this buffer type still needs to implement .alloc_buffer: - // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 - static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = { - /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, - /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size, - /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, - }, - /* .device = */ &g_ggml_metal_device, - /* .context = */ NULL, - }; +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(int device) { + static std::mutex mutex; + std::lock_guard lock(mutex); - return &ggml_backend_buffer_type_mapped_metal; + static std::vector bufts; + static std::vector ctxs; + + static bool initialized = false; + if (!initialized) { + bufts.reserve(g_devices); + ctxs.reserve(g_devices); + + for (int i = 0; i < g_devices; ++i) { + ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{ + /* .device = */ i, + /* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Mapped" + }; + 
ctxs.emplace_back(raw_ctx); + + // note: not obvious, but this buffer type still needs to implement .alloc_buffer: + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 + ggml_backend_buffer_type buft = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size, + /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i), + /* .context = */ raw_ctx, + }; + + bufts.emplace_back(buft); + } + + initialized = true; + } + + return &bufts[device]; } // backend static const char * ggml_backend_metal_name(ggml_backend_t backend) { - return "Metal"; + ggml_metal_t ctx = (ggml_metal_t)backend->context; - GGML_UNUSED(backend); + return ggml_metal_get_name(ctx); } static void ggml_backend_metal_free(ggml_backend_t backend) { @@ -409,12 +501,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg } static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - return false; + if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) { + return false; + } - GGML_UNUSED(backend_src); - GGML_UNUSED(backend_dst); - GGML_UNUSED(src); - GGML_UNUSED(dst); + if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) { + return false; + } + + ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context; + ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context; + + //ggml_backend_buffer_t buf_src = src->view_src ? 
src->view_src->buffer : src->buffer; + //ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; + + //ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context; + //ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context; + + return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst); } static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { @@ -423,6 +527,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, return ggml_metal_graph_compute(ctx, cgraph); } +static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + ggml_metal_event_t ev = (ggml_metal_event_t)event->context; + + ggml_metal_event_record(ctx, ev); +} + +static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + ggml_metal_event_t ev = (ggml_metal_event_t)event->context; + + ggml_metal_event_wait(ctx, ev); +} + static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_metal_t ctx = (ggml_metal_t)backend->context; @@ -435,7 +553,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { ggml_metal_t ctx = (ggml_metal_t)backend->context; ggml_metal_set_n_cb(ctx, n_cb); - } static ggml_backend_i ggml_backend_metal_i = { @@ -450,12 +567,8 @@ static ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, - - // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal - // in any case, these docs seem relevant if we ever decide to implement it: - // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events - /* .event_record = */ NULL, - 
/* .event_wait = */ NULL, + /* .event_record = */ ggml_backend_metal_event_record, + /* .event_wait = */ ggml_backend_metal_event_wait, /* .graph_optimize = */ ggml_backend_metal_graph_optimize, }; @@ -519,15 +632,17 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { // backend device static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) { - return "Metal"; + ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; - GGML_UNUSED(dev); + const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev); + + return props_dev->name; } static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) { ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; - return ggml_metal_device_get_props(ctx_dev)->name; + return ggml_metal_device_get_props(ctx_dev)->desc; } static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -550,14 +665,14 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { - /* .async = */ true, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, - /* .events = */ false, + /* .async = */ true, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ true, }; } -static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) { +static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t dev, const char * params) { ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; ggml_metal_t ctx = ggml_metal_init(ctx_dev); @@ -587,7 +702,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev); - return props_dev->use_shared_buffers ? 
ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private(); + return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared(props_dev->device) : ggml_backend_metal_buffer_type_private(props_dev->device); } static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { @@ -595,7 +710,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size); - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size); + const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev); + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(props_dev->device), ggml_backend_metal_buffer_shared_i, res, size); } static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { @@ -606,9 +723,10 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return + buft->device == dev && ( buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name || buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name || - buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name; + buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name); GGML_UNUSED(dev); } @@ -632,45 +750,97 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size; } +static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) { + ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; + + ggml_metal_event_t event = 
ggml_metal_device_event_init(ctx_dev); + GGML_ASSERT(event); + + ggml_backend_event_t ev = new ggml_backend_event { + /* .device = */ dev, + /* .context = */ event, + }; + + return ev; +} + +static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; + + ggml_metal_event_t ev = (ggml_metal_event_t)event->context; + + ggml_metal_device_event_free(ctx_dev, ev); + + delete event; +} + +static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context; + + ggml_metal_event_t evt = (ggml_metal_event_t)event->context; + + ggml_metal_device_event_synchronize(ctx_dev, evt); +} + static ggml_backend_device_i ggml_backend_metal_device_i = { /* .get_name = */ ggml_backend_metal_device_get_name, /* .get_description = */ ggml_backend_metal_device_get_description, /* .get_memory = */ ggml_backend_metal_device_get_memory, /* .get_type = */ ggml_backend_metal_device_get_type, /* .get_props = */ ggml_backend_metal_device_get_props, - /* .init_backend = */ ggml_backend_metal_device_init, + /* .init_backend = */ ggml_backend_metal_device_init_backend, /* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped, /* .supports_op = */ ggml_backend_metal_device_supports_op, /* .supports_buft = */ ggml_backend_metal_device_supports_buft, /* .offload_op = */ ggml_backend_metal_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .event_new = */ ggml_backend_metal_device_event_new, + /* .event_free = */ ggml_backend_metal_device_event_free, + /* .event_synchronize = */ ggml_backend_metal_device_event_synchronize, }; // backend registry +struct ggml_backend_metal_reg { + std::vector devices; +}; + 
+typedef struct ggml_backend_metal_reg * ggml_backend_metal_reg_t; + +static ggml_backend_metal_reg_t ggml_backend_metal_reg_init(void) { + ggml_backend_metal_reg_t ctx = new struct ggml_backend_metal_reg; + + return ctx; +} + +static void ggml_backend_metal_reg_free(ggml_backend_metal_reg_t ctx) { + delete ctx; +} + +struct ggml_backend_metal_reg_deleter { + void operator()(ggml_backend_metal_reg_t ctx) { + ggml_backend_metal_reg_free(ctx); + } +}; + +typedef std::unique_ptr ggml_backend_metal_reg_ptr; + static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) { - return "Metal"; + return GGML_METAL_NAME; GGML_UNUSED(reg); } static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); + ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context; + return ctx->devices.size(); } static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - return &g_ggml_metal_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); + ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; } static ggml_backend_feature g_ggml_backend_metal_features[] = { @@ -698,27 +868,67 @@ static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const static ggml_backend_reg_i ggml_backend_metal_reg_i = { /* .get_name = */ ggml_backend_metal_reg_get_name, - /* .device_count = */ ggml_backend_metal_reg_device_count, - /* .device_get = */ ggml_backend_metal_reg_device_get, + /* .get_device_count = */ ggml_backend_metal_reg_device_count, + /* .get_device = */ ggml_backend_metal_reg_device_get, /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; -ggml_backend_reg_t ggml_backend_metal_reg(void) { - { - g_ggml_metal_reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_metal_reg_i, - /* .context = */ NULL, - }; 
+static ggml_backend_dev_t ggml_backend_metal_device_init(ggml_backend_reg_t reg, int device) { + return new ggml_backend_device { + /* .iface = */ ggml_backend_metal_device_i, + /* .reg = */ reg, + /* .context = */ ggml_metal_device_get(device), + }; +} - g_ggml_metal_device = { - /* .iface = */ ggml_backend_metal_device_i, - /* .reg = */ &g_ggml_metal_reg, - /* .context = */ ggml_metal_device_get(), - }; +static void ggml_backend_metal_device_free(ggml_backend_dev_t dev) { + delete dev; +} + +struct ggml_backend_device_deleter { + void operator()(ggml_backend_dev_t ctx) { + ggml_backend_metal_device_free(ctx); + } +}; + +typedef std::unique_ptr ggml_backend_device_ptr; + +ggml_backend_reg_t ggml_backend_metal_reg(void) { + static ggml_backend_reg reg; + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + + const char * env = getenv("GGML_METAL_DEVICES"); + if (env) { + g_devices = atoi(env); + } + + static std::vector devs; + + if (!initialized) { + static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init()); + + for (int i = 0; i < g_devices; ++i) { + auto * dev = ggml_backend_metal_device_init(®, i); + devs.emplace_back(dev); + + reg_ctx->devices.push_back(dev); + } + + reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ reg_ctx.get(), + }; + } + + initialized = true; } - return &g_ggml_metal_reg; + return ® } GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index d510d21cc..27d886c88 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -344,7 +344,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, in_path, "-o", out_path}; 
#endif - // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 + // disable spirv-opt for coopmat shaders for https://github.com/ggml-org/llama.cpp/issues/10734 // disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344 // disable spirv-opt for rope shaders for https://github.com/ggml-org/llama.cpp/issues/16860 if (!coopmat && name.find("bf16") == std::string::npos && name.find("rope") == std::string::npos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b04c1f8cd..6756bab03 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6578,7 +6578,7 @@ static void ggml_compute_backward( case GGML_OP_DIAG_MASK_INF: { if (src0_needs_grads) { /* ggml_diag_mask_inf_impl() shouldn't be here */ - /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ + /* ref: https://github.com/ggml-org/llama.cpp/pull/4203#discussion_r1412377992 */ const int n_past = ((const int32_t *) tensor->op_params)[0]; ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false)); } @@ -7533,8 +7533,11 @@ void ggml_quantize_free(void) { iq2xs_free_impl(GGML_TYPE_IQ2_XXS); iq2xs_free_impl(GGML_TYPE_IQ2_XS); + iq2xs_free_impl(GGML_TYPE_IQ2_S); iq2xs_free_impl(GGML_TYPE_IQ1_S); + iq2xs_free_impl(GGML_TYPE_IQ1_M); iq3xs_free_impl(256); + iq3xs_free_impl(512); ggml_critical_section_end(); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 31273b2b5..6f56d36c5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -284,6 +284,8 @@ class Keys: class ClipVision: PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models IMAGE_SIZE = "clip.vision.image_size" + IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels" + IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels" PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" PATCH_SIZE = "clip.vision.patch_size" EMBEDDING_LENGTH = "clip.vision.embedding_length" diff --git 
a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7fbb78866..0b9c65016 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1113,6 +1113,12 @@ class GGUFWriter: def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + def add_vision_max_pixels(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value) + + def add_vision_min_pixels(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value) + def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) diff --git a/json_to_gbnf.py b/json_to_gbnf.py index 369007d59..cb35a0a8c 100644 --- a/json_to_gbnf.py +++ b/json_to_gbnf.py @@ -402,7 +402,7 @@ class SchemaConverter: Transforms a regular expression pattern into a GBNF rule. Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions - Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. diff --git a/otherarch/ttscpp/TTSCPP_LICENSE b/otherarch/ttscpp/TTSCPP_LICENSE index b5479530c..dbd8d91be 100644 --- a/otherarch/ttscpp/TTSCPP_LICENSE +++ b/otherarch/ttscpp/TTSCPP_LICENSE @@ -3,7 +3,7 @@ KoboldCpp uses a minimal implementation with some files removed. 
MIT License -Copyright (c) 2023-2024 The ggml authors +Copyright (c) 2023-2026 The ggml authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 3c7e0afda..c415a998f 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -233,7 +233,7 @@ int32_t llm_chat_apply_template( llm_chat_template tmpl, const std::vector & chat, std::string & dest, bool add_ass) { - // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 + // Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527 std::stringstream ss; if (tmpl == LLM_CHAT_TEMPLATE_CHATML) { // chatml template diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1e9c1378b..cbdf86d3b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -325,6 +325,7 @@ llama_context::llama_context( auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend + // TODO: should we ignore ACCEL types too? continue; } auto * dev = ggml_backend_get_device(backend.get()); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index caed0ec1b..dfbc7d95e 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -195,7 +195,7 @@ struct llama_hparams { uint32_t n_deepstack_layers = 0; // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) - // ref: https://github.com/ggerganov/llama.cpp/pull/8141 + // ref: https://github.com/ggml-org/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; uint32_t dec_n_layer = 0; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 3e22d8371..71fb0073c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -94,7 +94,7 @@ static_assert(std::is_trivially_copyable::value, "llm_symbol is not // // SPM tokenizer // original implementation: -// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 +// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 // struct llm_bigram_spm { @@ -510,7 +510,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { // original regex from tokenizer.json //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 + // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989 "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; @@ -2501,6 +2501,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "
"
                         || t.first == "▁<PRE>"          // CodeLlama
                         || t.first == "<|code_prefix|>" // GLM-4.5
+                        || t.first == "<|prefix|>"      // Falcon-H1-Tiny-Coder
                         ) {
                     special_fim_pre_id = t.second;
                     if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2521,6 +2522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<SUF>"
                         || t.first == "▁<SUF>"         // CodeLlama
                         || t.first == "<|code_suffix|>" // GLM-4.5
+                        || t.first == "<|suffix|>"      // Falcon-H1-Tiny-Coder
                         ) {
                     special_fim_suf_id = t.second;
                     if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2541,6 +2543,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<MID>"
                         || t.first == "▁<MID>"         // CodeLlama
                         || t.first == "<|code_middle|>" // GLM-4.5
+                        || t.first == "<|middle|>"      // Falcon-H1-Tiny-Coder
                         ) {
                     special_fim_mid_id = t.second;
                     if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2629,7 +2632,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         // maintain a list of tokens that cause end-of-generation
         // this is currently determined based on the token text, which is obviously not ideal
-        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        // ref: https://github.com/ggml-org/llama.cpp/issues/9606
         special_eog_ids.clear();
 
         if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
@@ -3355,7 +3358,7 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
     {
         return llama_token_to_piece_old(vocab, token, buf, length);
     }
-    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    // ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
     const llama_token_attr attr = token_get_attr(token);
     if (!special && (attr & attr_special)) {
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp
index 297dca513..987f44993 100644
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -14,7 +14,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     const uint32_t kv_lora_rank = hparams.n_lora_kv;
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
     // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
 
     // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index f6114f1e2..1f25c6ae7 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -189,12 +189,24 @@ static void test_conditionals(testing & t) {
         "negated"
     );
 
-    test_template(t, "in operator",
+    test_template(t, "in operator (element in array)",
         "{% if 'x' in items %}found{% endif %}",
         {{"items", json::array({"x", "y"})}},
         "found"
     );
 
+    test_template(t, "in operator (substring)",
+        "{% if 'bc' in 'abcd' %}found{% endif %}",
+        json::object(),
+        "found"
+    );
+
+    test_template(t, "in operator (object key)",
+        "{% if 'key' in obj %}found{% endif %}",
+        {{"obj", {{"key", 1}, {"other", 2}}}},
+        "found"
+    );
+
     test_template(t, "is defined",
         "{% if x is defined %}yes{% else %}no{% endif %}",
         {{"x", 1}},
@@ -1036,6 +1048,42 @@ static void test_tests(testing & t) {
         json::object(),
         "yes"
     );
+
+    test_template(t, "is in (array, true)",
+        "{{ 'yes' if 2 is in([1, 2, 3]) }}",
+        json::object(),
+        "yes"
+    );
+
+    test_template(t, "is in (array, false)",
+        "{{ 'yes' if 5 is in([1, 2, 3]) else 'no' }}",
+        json::object(),
+        "no"
+    );
+
+    test_template(t, "is in (string)",
+        "{{ 'yes' if 'bc' is in('abcde') }}",
+        json::object(),
+        "yes"
+    );
+
+    test_template(t, "is in (object keys)",
+        "{{ 'yes' if 'a' is in(obj) }}",
+        {{"obj", {{"a", 1}, {"b", 2}}}},
+        "yes"
+    );
+
+    test_template(t, "reject with in test",
+        "{{ items | reject('in', skip) | join(', ') }}",
+        {{"items", json::array({"a", "b", "c", "d"})}, {"skip", json::array({"b", "d"})}},
+        "a, c"
+    );
+
+    test_template(t, "select with in test",
+        "{{ items | select('in', keep) | join(', ') }}",
+        {{"items", json::array({"a", "b", "c", "d"})}, {"keep", json::array({"b", "c"})}},
+        "b, c"
+    );
 }
 
 static void test_string_methods(testing & t) {
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index d66106ab2..30e4f0d79 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -36,6 +36,8 @@
 // vision-specific
 #define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS    "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS    "clip.vision.image_max_pixels"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
diff --git a/tools/server/public_legacy/index-new.html b/tools/server/public_legacy/index-new.html
index cbfbbdf28..e2f39d668 100644
--- a/tools/server/public_legacy/index-new.html
+++ b/tools/server/public_legacy/index-new.html
@@ -1096,7 +1096,7 @@ return html`
           
           
         
       `;
diff --git a/tools/server/public_legacy/index.html b/tools/server/public_legacy/index.html
index 75f39330a..98d56ea8b 100644
--- a/tools/server/public_legacy/index.html
+++ b/tools/server/public_legacy/index.html
@@ -1281,7 +1281,7 @@
 
           
         
       `;
diff --git a/tools/server/public_legacy/theme-mangotango.css b/tools/server/public_legacy/theme-mangotango.css
index e43380245..315daf734 100755
--- a/tools/server/public_legacy/theme-mangotango.css
+++ b/tools/server/public_legacy/theme-mangotango.css
@@ -1,5 +1,5 @@
 /* Author: Yazan Agha-Schrader */
-/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
+/* Inspiration from llama.cpp logo/banner https://github.com/ggml-org/llama.cpp#readme */
 
 .theme-mangotango {
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 803cb02e6..57655476a 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -767,7 +767,7 @@ static bool router_validate_model(const std::string & name, server_models & mode
     }
     auto meta = models.get_meta(name);
     if (!meta.has_value()) {
-        res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     if (models_autoload) {
diff --git a/tools/server/themes/buttons-top/index.html b/tools/server/themes/buttons-top/index.html
index 3fb88fcc8..cb5af587a 100644
--- a/tools/server/themes/buttons-top/index.html
+++ b/tools/server/themes/buttons-top/index.html
@@ -1032,7 +1032,7 @@
 
           
         
       `;
diff --git a/tools/server/themes/wild/index.html b/tools/server/themes/wild/index.html
index 73f36d4b2..601f7762c 100644
--- a/tools/server/themes/wild/index.html
+++ b/tools/server/themes/wild/index.html
@@ -1036,7 +1036,7 @@
 
           
         
       `;