diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index f79acfef1..350bbdf7f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") endif() endif() - set(GIT_INDEX "${GIT_DIR}/index") + if(EXISTS "${GIT_DIR}/index") + set(GIT_INDEX "${GIT_DIR}/index") + else() + message(WARNING "Git index not found in git repository.") + set(GIT_INDEX "") + endif() else() message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") set(GIT_INDEX "") diff --git a/common/common.cpp b/common/common.cpp index 1e5408fbb..857d4d296 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -514,12 +514,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-accept" || arg == "-pa") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_accept = std::stof(argv[i]); } else if (arg == "--p-split" || arg == "-ps") { if (++i >= argc) { invalid_param = true; @@ -1045,7 +1039,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); @@ -1300,7 +1293,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; cparams.seed = params.seed; cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; + cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale; diff --git a/common/common.h b/common/common.h index 5f3aa43fa..411204dbf 100644 --- a/common/common.h +++ b/common/common.h @@ -37,7 +37,7 @@ int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed + uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_threads = get_num_physical_cores(); int32_t n_threads_draft = -1; @@ -47,11 +47,10 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 8; // number of tokens to draft during speculative decoding + int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - float p_accept = 0.5f; // speculative decoding accept probability float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) diff --git a/common/sampling.cpp b/common/sampling.cpp index 0d742d7ea..c2e952f42 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -295,6 +295,77 @@ static llama_token llama_sampling_sample_impl( return id; } +static llama_token_data_array llama_sample_probability_distribution_impl( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + const llama_sampling_params & params = ctx_sampling->params; + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; + const bool penalize_nl = params.penalize_nl; + + auto & prev = ctx_sampling->prev; + auto & cur = ctx_sampling->cur; + + // Get a pointer to the logits + float * logits = llama_get_logits_ith(ctx_main, idx); + + // Declare original_logits at the beginning of the function scope + std::vector original_logits; + + // apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + + cur.clear(); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + + // apply penalties + const auto& penalty_tokens = params.use_penalty_prompt_tokens ? 
params.penalty_prompt_tokens : prev; + const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); + if (penalty_tokens_used_size) { + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; + + llama_sample_repetition_penalties(ctx_main, &cur_p, + penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, + penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); + + if (!penalize_nl) { + for (size_t idx = 0; idx < cur_p.size; idx++) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { + cur_p.data[idx].logit = nl_logit; + break; + } + } + } + } + + // apply grammar checks + if (ctx_sampling->grammar != NULL) { + llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); + } + + llama_sample_softmax(ctx_main, &cur_p); + return cur_p; +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -304,6 +375,14 @@ llama_token llama_sampling_sample( return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); } +llama_token_data_array llama_sampling_probability_distribution( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx); +} + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, diff --git a/common/sampling.h b/common/sampling.h index 9b635a905..a5b8e6079 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -132,6 +132,13 @@ llama_token llama_sampling_sample( struct llama_context * ctx_cfg, int idx = 0); +// returns the probability that token of given id will be sampled +llama_token_data_array llama_sampling_probability_distribution( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = 0); + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ffdba7444..f6369af38 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -36,8 +36,10 @@ class SentencePieceTokenTypes(IntEnum): UNUSED = 5 BYTE = 6 + AnyModel = TypeVar("AnyModel", bound="type[Model]") + class Model(ABC): _model_classes: dict[str, type[Model]] = {} @@ -187,6 +189,7 @@ class Model(ABC): @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names + def func(modelcls: type[Model]): for name in names: cls._model_classes[name] = modelcls diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index c2f2c293e..4da81998a 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -20,11 +20,11 @@ static std::vector split_lines(const std::string & s) { static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, false); + llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); } } -static void normalize(float * vec, float * out, int n) { +static void normalize(const float * vec, float * out, int n) { float norm = 0; for (int i = 0; i < n; i++) { norm += vec[i] * vec[i]; @@ -46,10 +46,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } // normalize on copy - for 
(int k = 0; k < n_seq; k++) { - float * emb = llama_get_embeddings_ith(ctx, k); - float * out = output + k * n_embd; - normalize(emb, out, n_embd); + for (int i = 0; i < batch.n_tokens; i++) { + if (!batch.logits[i]) { + continue; + } + + // try to get sequence embeddings - supported only when pooling_type is not NONE + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + if (embd == NULL) { + fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + continue; + } + } + + float * out = output + batch.seq_id[i][0] * n_embd; + normalize(embd, out, n_embd); } } @@ -133,7 +146,7 @@ int main(int argc, char ** argv) { // initialize batch const int n_prompts = prompts.size(); - struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); // allocate output const int n_embd = llama_n_embd(model); @@ -146,6 +159,7 @@ int main(int argc, char ** argv) { for (int k = 0; k < n_prompts; k++) { // clamp to n_batch tokens auto & inp = inputs[k]; + const uint64_t n_toks = inp.size(); // encode if at capacity diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b523b8be8..d693cecc0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -512,6 +512,14 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; + // tokenized antiprompts + std::vector> antiprompt_ids; + + antiprompt_ids.reserve(params.antiprompt.size()); + for (const std::string & antiprompt : params.antiprompt) { + antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); + } + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { @@ -770,6 +778,18 @@ int main(int argc, char ** argv) { } } + // check for reverse prompt using special tokens + llama_token last_token = llama_sampling_last(ctx_sampling); + for (std::vector ids : antiprompt_ids) { + if (ids.size() == 1 && last_token == ids[0]) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + if (is_antiprompt) { LOG("found antiprompt: %s\n", last_output.c_str()); } diff --git a/examples/server-embd.py b/examples/server-embd.py new file mode 100644 index 000000000..c5c4ea87b --- /dev/null +++ b/examples/server-embd.py @@ -0,0 +1,34 @@ +import asyncio +import requests +import numpy as np + +n = 8 + +result = [] + +async def requests_post_async(*args, **kwargs): + return await asyncio.to_thread(requests.post, *args, **kwargs) + +async def main(): + model_url = "http://127.0.0.1:6900" + responses: list[requests.Response] = await asyncio.gather(*[requests_post_async( + url= f"{model_url}/embedding", + json= {"content": str(i)*1024} + ) for i in range(n)]) + + for response in responses: + embedding = response.json()["embedding"] + print(embedding[-8:]) + result.append(embedding) + +asyncio.run(main()) + +# compute cosine similarity + +for i in range(n-1): + for j in range(i+1, n): + embedding1 = np.array(result[i]) + embedding2 = np.array(result[j]) + similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) + print(f"Similarity between {i} and {j}: {similarity:.2f}") + diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6c3f14288..2ee1e2a69 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -414,7 +414,7 @@ struct 
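Editor's note on the embedding changes above: `batch_decode` now L2-normalizes each pooled sequence embedding (preferring `llama_get_embeddings_seq` and falling back to `llama_get_embeddings_ith`), and the new `examples/server-embd.py` script then checks pairwise cosine similarity of the returned vectors. For unit-length vectors cosine similarity reduces to a plain dot product. A minimal standalone sketch of that relationship (illustration only, not code from the PR):

```cpp
// Standalone sketch: L2-normalize two raw embeddings and compare them.
// For normalized vectors, cosine similarity is just the dot product,
// which is what server-embd.py computes (via numpy) on the /embedding results.
#include <cmath>
#include <cstdio>
#include <vector>

static void normalize(const float * vec, float * out, int n) {
    float norm = 0.0f;
    for (int i = 0; i < n; i++) norm += vec[i] * vec[i];
    norm = std::sqrt(norm);
    for (int i = 0; i < n; i++) out[i] = vec[i] / norm;
}

static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f;
    for (size_t i = 0; i < a.size(); i++) dot += a[i] * b[i];
    return dot; // valid because a and b are unit length
}

int main() {
    std::vector<float> raw0 = {1.0f, 2.0f, 3.0f}, raw1 = {2.0f, 4.0f, 6.5f};
    std::vector<float> e0(3), e1(3);
    normalize(raw0.data(), e0.data(), 3);
    normalize(raw1.data(), e1.data(), 3);
    printf("similarity: %.3f\n", cosine_similarity(e0, e1));
    return 0;
}
```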
llama_server_context int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); if (res < 0) { LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template + sparams.chat_template = "chatml"; } } @@ -1211,7 +1211,7 @@ struct llama_server_context queue_results.send(res); } - void send_embedding(server_slot &slot) + void send_embedding(server_slot & slot, const llama_batch & batch) { task_result res; res.id = slot.task_id; @@ -1220,6 +1220,7 @@ struct llama_server_context res.stop = true; const int n_embd = llama_n_embd(model); + if (!params.embedding) { LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}}); @@ -1230,12 +1231,29 @@ struct llama_server_context } else { - const float *data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); - res.result_json = json - { - {"embedding", embedding}, - }; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + if (embd == NULL) { + LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}}); + res.result_json = json + { + {"embedding", std::vector(n_embd, 0.0f)}, + }; + continue; + } + } + + res.result_json = json + { + {"embedding", std::vector(embd, embd + n_embd)}, + }; + } } queue_results.send(res); } @@ -1846,7 +1864,7 @@ struct llama_server_context ga_i += ga_w/ga_n; } } - llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false); slot_npast++; } @@ -1882,7 +1900,7 @@ struct llama_server_context for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); for (auto & slot : slots) { @@ -1955,7 +1973,7 @@ struct llama_server_context // prompt evaluated for embedding if (slot.embedding) { - send_embedding(slot); + send_embedding(slot, batch_view); slot.release(); slot.i_batch = -1; continue; @@ -2037,6 +2055,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" --pooling {none,mean,cls}\n"); + printf(" pooling type for embeddings, use model default if unspecified\n"); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -2277,6 +2297,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.yarn_beta_slow = std::stof(argv[i]); } + else if (arg == "--pooling") + { + if (++i >= argc) { + 
invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; break; } + } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) @@ -2331,7 +2363,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.n_batch = std::stoi(argv[i]); - params.n_batch = std::min(512, params.n_batch); } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { diff --git a/examples/speculative/README.md b/examples/speculative/README.md index 814efa592..a6608c5fe 100644 --- a/examples/speculative/README.md +++ b/examples/speculative/README.md @@ -6,3 +6,4 @@ More info: - https://github.com/ggerganov/llama.cpp/pull/2926 - https://github.com/ggerganov/llama.cpp/pull/3624 +- https://github.com/ggerganov/llama.cpp/pull/5625 diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 1053d4da6..2d3ce23f3 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -20,6 +21,7 @@ struct seq_draft { std::vector i_batch_tgt; std::vector tokens; + std::vector> dists; struct llama_sampling_context * ctx_sampling; }; @@ -39,12 +41,15 @@ int main(int argc, char ** argv) { // max number of parallel drafting sequences (i.e. tree branches) const int n_seq_dft = params.n_parallel; - // probability threshold for accepting a token from the draft model - const float p_accept = params.p_accept; - // probability threshold for splitting a draft branch (only for n_seq_dft > 1) const float p_split = params.p_split; + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + std::default_random_engine rng(params.seed); + std::uniform_real_distribution<> u_dist; + #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); LOG_TEE("Log start\n"); @@ -168,7 +173,9 @@ int main(int argc, char ** argv) { std::vector drafts(n_seq_dft); params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model + if (params.sparams.temp == 0) { + params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model + } for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); @@ -184,12 +191,15 @@ int main(int argc, char ** argv) { drafts[0].i_batch_tgt[0] = 0; while (true) { + std::set active_seqs = {}; + // print current draft sequences for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; } + active_seqs.insert(s); const auto & tokens = drafts[s].tokens; LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); @@ -198,48 +208,156 @@ int main(int argc, char ** argv) { int i_dft = 0; int s_keep = 0; + llama_token token_id; + std::string token_str; + + // loop until we fail to accept a drafted token or we run out of drafted tokens while (true) { - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - - // sample from the target model - llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, 
NULL, drafts[s_keep].i_batch_tgt[i_dft]); - - llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); - - const std::string token_str = llama_token_to_piece(ctx_tgt, id); - - if (!params.use_color) { - printf("%s", token_str.c_str()); - } - - if (id == llama_token_eos(model_tgt)) { - has_eos = true; - } - - ++n_predict; // check if the target token matches any of the drafts + // for stochastic sampling, attempt to match the token with the drafted tokens { - bool matches = false; + bool accept = false; + if (params.sparams.temp > 0) { + // stochastic verification - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; + llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + float p_tgt = 0, p_dft = 0; + + // GGML_ASSERT(dist_tgt.size() == dist_dft.size()); + + while (active_seqs.size() > 0) { + // randomly select a sequence to verify from active sequences + std::uniform_int_distribution u_int_dist(0, active_seqs.size() - 1); + int s = *std::next(active_seqs.begin(), u_int_dist(rng)); + if (i_dft >= (int) drafts[s].tokens.size()) { + drafts[s].active = false; + active_seqs.erase(s); + continue; + } + if (accept) { + // if we already accepted a token, we can skip the rest + if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) { + drafts[s].active = false; + active_seqs.erase(s); + } + continue; + } + LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); + float r = u_dist(rng); + llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true }; + // acquire the token probabilities assigned by the draft and target models + for (size_t i = 0; i < dist_tgt.size; i++) { + if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) { + p_tgt = dist_tgt.data[i].p; + } + if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) { + p_dft = dist_dft.data[i].p; + } + if (p_tgt && p_dft) { + break; + } + } + LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); + if (r <= p_tgt / p_dft) { + s_keep = s; + accept = true; + token_id = drafts[s].tokens[i_dft]; + token_str = llama_token_to_piece(ctx_tgt, token_id); + llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + + LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); + break; + } else { + LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + drafts[s].active = false; + + // calculate residual probability + GGML_ASSERT(dist_tgt.sorted); + GGML_ASSERT(dist_dft.sorted); + float sum_probs = 0.0f; + + // sort dist by id + std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { + return a.id < b.id; + }); + std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) { + return a.id < b.id; + }); + + for (size_t i = 0; i < dist_tgt.size; i++) { + dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p); + sum_probs += dist_tgt.data[i].p; + } + for (size_t i = 0; i < dist_tgt.size; i++) { + dist_tgt.data[i].p /= sum_probs; + } + + // sort dist_tgt by p desc + std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { + return a.p > 
b.p; + }); + } + + active_seqs.erase(s); + for(int i = 0; i < n_seq_dft; i++) { + if (i == s) { + continue; + } + if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) { + // synchronize active status for sequences with the same drafted token + drafts[i].active = drafts[i].active && accept; + if (!drafts[i].active) { + active_seqs.erase(s); + } + } + } } - if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + if (!accept) { + // all drafted tokens were rejected + // sample from the target model + LOG("all drafted tokens were rejected, sampling from residual distribution\n"); + token_id = llama_sample_token(ctx_tgt, &dist_tgt); + llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + token_str = llama_token_to_piece(ctx_tgt, token_id); + } - s_keep = s; - matches = true; - } else { - drafts[s].active = false; + } else { + // greedy verification + + // sample from the target model + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + + llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + + //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); + + token_str = llama_token_to_piece(ctx_tgt, token_id); + + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { + LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); + + s_keep = s; + accept = true; + } else { + drafts[s].active = false; + } } } - if (matches) { + if (token_id == llama_token_eos(model_tgt)) { + has_eos = true; + } + ++n_predict; + + if (accept) { ++n_accept; ++n_past_tgt; ++n_past_dft; @@ -247,17 +365,21 @@ int main(int argc, char ** argv) { if (params.use_color) { // Color token according to its origin sequence printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); - fflush(stdout); + } else { + printf("%s", token_str.c_str()); } + fflush(stdout); continue; + } else { + printf("%s", token_str.c_str()); + fflush(stdout); + break; } } - if (params.use_color) { - printf("%s", token_str.c_str()); - } - fflush(stdout); + } - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + { + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); // TODO: simplify { @@ -277,21 +399,21 @@ int main(int argc, char ** argv) { drafts[s].active = false; drafts[s].tokens.clear(); drafts[s].i_batch_tgt.clear(); + drafts[s].dists.clear(); } // note: will be erased after the speculation phase - drafts[0].tokens.push_back(id); + drafts[0].tokens.push_back(token_id); + drafts[0].dists.push_back(std::vector()); drafts[0].i_batch_tgt.push_back(0); llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - llama_decode (ctx_dft, batch_dft); + llama_decode(ctx_dft, batch_dft); ++n_past_dft; - - break; } if 
(n_predict > params.n_predict || has_eos) { @@ -336,12 +458,6 @@ int main(int argc, char ** argv) { k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); } - if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); - drafts[s].drafting = false; - continue; - } - std::vector sa(1, s); // attempt to split the branch if the probability is high enough @@ -369,6 +485,7 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].skip = true; drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].dists = drafts[s].dists; drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; @@ -391,6 +508,8 @@ int main(int argc, char ** argv) { llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); + // save cur_p.data into drafts[s].dists + drafts[s].dists.push_back(cur_p); // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); @@ -442,6 +561,7 @@ int main(int argc, char ** argv) { } drafts[s].tokens.erase(drafts[s].tokens.begin()); + drafts[s].dists.erase(drafts[s].dists.begin()); } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 0e5bf0ae1..2e9ba58a9 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -91,13 +91,14 @@ extern "C" { // (optional) complete all pending operations void (*GGML_CALL synchronize)(ggml_backend_t backend); - // compute graph with a plan + // create a plan for ggml_cgraph and free it ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph with a plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index c86673b04..d60d98414 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -262,11 +262,11 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } -void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - backend->iface.graph_plan_compute(backend, plan); +enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + return backend->iface.graph_plan_compute(backend, plan); } -bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -732,15 +732,15 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g GGML_UNUSED(backend); } -GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static enum ggml_status 
ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); GGML_UNUSED(backend); } -GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -755,8 +755,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; - ggml_graph_compute(cgraph, &cplan); - return true; + return ggml_graph_compute(cgraph, &cplan); } GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -1437,7 +1436,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { +static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; @@ -1472,8 +1471,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { - if (!ggml_backend_graph_compute(split_backend, &split->graph)) { - return false; + enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph); + if (ec != GGML_STATUS_SUCCESS) { + return ec; } //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { @@ -1494,8 +1494,9 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - if (!ggml_backend_graph_compute(split_backend, &gv)) { - return false; + enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv); + if (ec != GGML_STATUS_SUCCESS) { + return ec; } if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { @@ -1519,7 +1520,7 @@ static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { } #endif - return true; + return GGML_STATUS_SUCCESS; } ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { @@ -1581,7 +1582,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return true; } -bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); if (!sched->is_reset) { @@ -1590,14 +1591,10 @@ bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg ggml_backend_sched_split_graph(sched, graph); if (!ggml_backend_sched_alloc_splits(sched)) { - return false; + return GGML_STATUS_ALLOC_FAILED; } - if (!ggml_backend_sched_compute_splits(sched)) { - return false; - } - - return true; + return ggml_backend_sched_compute_splits(sched); } 
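Editor's note on the ggml-backend changes above (and the header declarations that follow): the graph-compute entry points now return `enum ggml_status` instead of `bool`, so allocation failures can be distinguished from other backend failures. A minimal caller-side sketch, assuming only the symbols visible in this diff (`ggml_backend_sched_graph_compute`, `GGML_STATUS_SUCCESS`, `GGML_STATUS_ALLOC_FAILED`):

```cpp
// Minimal caller sketch (not part of the PR): propagate the new enum ggml_status
// instead of a bool. GGML_STATUS_ALLOC_FAILED is what ggml_backend_sched_graph_compute
// returns when split allocation fails; other failures surface as a non-success status
// from the backend (see ggml-backend.c above).
#include <cstdio>
#include "ggml-backend.h"

static bool compute_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    const enum ggml_status st = ggml_backend_sched_graph_compute(sched, graph);
    if (st == GGML_STATUS_SUCCESS) {
        return true;
    }
    fprintf(stderr, "%s: graph compute failed, ggml_status = %d%s\n",
            __func__, (int) st, st == GGML_STATUS_ALLOC_FAILED ? " (alloc failed)" : "");
    return false;
}
```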
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { diff --git a/ggml-backend.h b/ggml-backend.h index 8fb54bd92..8bed22578 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -66,12 +66,13 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); @@ -157,26 +158,26 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // Get the number of splits of the last graph - GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); - GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler - GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status 
ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // Reset all assignments and allocators - must be called before changing the node backends - GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); + GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // Set a callback to be called for each resulting node during graph compute - GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); // // Utils diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 66a597056..74562d33c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -616,6 +616,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q #define CUDA_UPSCALE_BLOCK_SIZE 256 #define CUDA_CONCAT_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256 +#define CUDA_ARANGE_BLOCK_SIZE 256 +#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 #define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256 #define CUDA_POOL2D_BLOCK_SIZE 256 @@ -991,17 +993,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst, nidx + blockIdx.y * ne0 + blockIdx.z * ne0 * gridDim.y; - dst[offset_dst] = x[offset_src]; + dst[offset_dst] = x[offset_src]; } else { int offset_src = nidx + blockIdx.y * ne0 + (blockIdx.z - ne02) * ne0 * gridDim.y; - dst[offset_dst] = y[offset_src]; + dst[offset_dst] = y[offset_src]; } } -static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) { +static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) { + // blockIdx.z: idx of ne02*ne03 + // blockIdx.y: idx of ne01*scale_factor, aka ne1 + // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE + // ne00xne01: ne00 * ne01 int ne0 = ne00 * scale_factor; int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { @@ -1013,7 +1019,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, int offset_src = i00 + i01 * ne00 + - blockIdx.z * nb02; + blockIdx.z * ne00xne01; int offset_dst = nidx + blockIdx.y * ne0 + @@ -1021,7 +1027,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, dst[offset_dst] = x[offset_src]; } -static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) { +static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { + // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 + // blockIdx.y: idx of ne1 + // blockIDx.x: idx of ne0 / BLOCK_SIZE int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { return; @@ -1032,19 +1041,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons nidx + blockIdx.y * ne0 + blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) { + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { int offset_src = nidx + blockIdx.y * ne00 + blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; + dst[offset_dst] = x[offset_src]; } else { dst[offset_dst] = 0.0f; } } +static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { + // blockIDx.x: idx of ne0 / BLOCK_SIZE 
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + dst[nidx] = start + step * nidx; +} + +static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { + // blockIDx.y: idx of timesteps->ne[0] + // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = blockIdx.y; + int j = threadIdx.x + blockIdx.x * blockDim.x; + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); +} + template static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { + // blockIdx.x: num_groups idx + // threadIdx.x: block_size idx int start = blockIdx.x * group_size; int end = start + group_size; @@ -6449,7 +6492,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { return; @@ -6457,17 +6500,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // then combine those indices with the corresponding byte offsets to get the total offsets - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + const int64_t i03 = i/(ne00 * ne01 * ne02); + const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + const int64_t i13 = i/(ne10 * ne11 * ne12); + const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; cpy_1(cx + x_offset, cdst + dst_offset); } @@ -6957,23 +7000,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min, template static __global__ void im2col_kernel( - const float * x, T * dst, int batch_offset, - int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW, + const float * x, T * dst, int64_t batch_offset, + int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, 
int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1) { - const int i = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= pelements) { return; } - const int ksize = OW * (KH > 1 ? KW : 1); - const int kx = i / ksize; - const int kd = kx * ksize; - const int ky = (i - kd) / OW; - const int ix = i % OW; + const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t kx = i / ksize; + const int64_t kd = kx * ksize; + const int64_t ky = (i - kd) / OW; + const int64_t ix = i % OW; - const int oh = blockIdx.y; - const int batch = blockIdx.z / IC; - const int ic = blockIdx.z % IC; + const int64_t oh = blockIdx.y; + const int64_t batch = blockIdx.z / IC; + const int64_t ic = blockIdx.z % IC; const int64_t iiw = ix * s0 + kx * d0 - p0; const int64_t iih = oh * s1 + ky * d1 - p1; @@ -7299,19 +7342,20 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const concat_f32<<>>(x, y, dst, ne0, ne02); } -static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) { +static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, + const int scale_factor, cudaStream_t stream) { int ne0 = (ne00 * scale_factor); int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; - dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02); + dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03); upscale_f32<<>>(x, dst, ne00, ne00 * ne01, scale_factor); } static void pad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, - const int ne0, const int ne1, const int ne2, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int ne03, + const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2); - pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02); + dim3 gridDim(num_blocks, ne1, ne2*ne3); + pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); } static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { @@ -8457,8 +8501,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float * template static void im2col_cuda(const float* x, T* dst, - int IW, int IH, int OW, int OH, int KW, int KH, int IC, - int batch, int batch_offset, int offset_delta, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { const int parallel_elements = OW * KW * KH; const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; @@ -9128,7 +9172,7 @@ static void ggml_cuda_op_group_norm( int num_groups = dst->op_params[0]; int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); + group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream); (void) src1; (void) dst; @@ -9161,7 +9205,7 @@ static void ggml_cuda_op_upscale( const int scale_factor = dst->op_params[0]; - 
upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); + upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream); (void) src1; (void) dst; @@ -9177,8 +9221,49 @@ static void ggml_cuda_op_pad( GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors pad_f32_cuda(src0_dd, dst_dd, - src0->ne[0], src0->ne[1], src0->ne[2], - dst->ne[0], dst->ne[1], dst->ne[2], main_stream); + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_cuda_op_arange( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float stop; + float step; + memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream); + + (void) src0; + (void) src1; + (void) src0_dd; + (void) src1_dd; +} + +static void ggml_cuda_op_timestep_embedding( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream); (void) src1; (void) dst; @@ -10463,6 +10548,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); } +static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + cuda_pool_alloc dst_f; + + ggml_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = dst_f.alloc(ggml_nelements(dst)); + } + + // do the computation + ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (dst->backend == GGML_BACKEND_TYPE_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding); +} + static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } @@ -11369,6 +11493,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct 
ggml_compute_params * params, st case GGML_OP_PAD: func = ggml_cuda_pad; break; + case GGML_OP_ARANGE: + func = ggml_cuda_arange; + break; + case GGML_OP_TIMESTEP_EMBEDDING: + func = ggml_cuda_timestep_embedding; + break; case GGML_OP_LEAKY_RELU: func = ggml_cuda_leaky_relu; break; @@ -12109,7 +12239,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -12145,7 +12275,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg GGML_ASSERT(ok); } - return true; + return GGML_STATUS_SUCCESS; } GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { @@ -12264,6 +12394,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: return true; default: diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index e740a76d1..83a7822fd 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g return ggml_backend_kompute_buffer_type(ctx->device); } -static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { auto * ctx = static_cast(backend->context); ggml_vk_graph_compute(ctx, cgraph); - return true; + return GGML_STATUS_SUCCESS; } static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/ggml-metal.m b/ggml-metal.m index 60c9e63d0..00df22838 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -163,6 +163,8 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, + GGML_METAL_KERNEL_TYPE_ARANGE_F32, + GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, @@ -569,6 +571,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); @@ -697,6 +701,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const return false; case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: return true; @@ 
-742,7 +748,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } } -static bool ggml_metal_graph_compute( +static enum ggml_status ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @@ -1091,7 +1097,8 @@ static bool ggml_metal_graph_compute( { GGML_ASSERT(ggml_is_contiguous(src0)); - const float scale = *(const float *) dst->op_params; + float scale; + memcpy(&scale, dst->op_params, sizeof(scale)); int64_t n = ggml_nelements(dst); @@ -1250,11 +1257,15 @@ static bool ggml_metal_graph_compute( pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; } - const float scale = ((float *) dst->op_params)[0]; - const float max_bias = ((float *) dst->op_params)[1]; + float scale; + float max_bias; + + memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); + memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_y = src0->ne[1]; + const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); @@ -2086,6 +2097,7 @@ static bool ggml_metal_graph_compute( //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); @@ -2300,6 +2312,50 @@ static bool ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_ARANGE: + { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float step; + + memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:0]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1]; + [encoder setBytes:&start length:sizeof(start) atIndex:2]; + [encoder setBytes:&step length:sizeof(step) atIndex:3]; + + const int nth = MIN(1024, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + const int half = dim / 2; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2]; + [encoder setBytes:&dim length:sizeof(dim) atIndex:3]; + [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4]; + + const int nth = MIN(1024, half); + + [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_ARGSORT: { GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -2427,8 +2483,8 @@ static bool ggml_metal_graph_compute( MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { - GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu. 
Check if you ran out of memory.\n", __func__, i, status); - return false; + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + return GGML_STATUS_FAILED; } } @@ -2437,7 +2493,7 @@ static bool ggml_metal_graph_compute( } } - return true; + return GGML_STATUS_SUCCESS; } //////////////////////////////////////////////////////////////////////////////// @@ -2739,7 +2795,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe UNUSED(backend); } -GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); diff --git a/ggml-metal.metal b/ggml-metal.metal index 8b9488437..a65d12641 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32( } } +kernel void kernel_arange_f32( + device char * dst, + constant int64_t & ne0, + constant float & start, + constant float & step, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + device float * dst_ptr = (device float *) dst; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + dst_ptr[i0] = start + step * i0; + } +} + +kernel void kernel_timestep_embedding_f32( + device const char * src0, + device char * dst, + constant uint64_t & nb1, + constant int & dim, + constant int & max_period, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + int i = tgpig.x; + device float * embed_data = (device float *)(dst + i*nb1); + + int half_ = dim / 2; + for (int j = tpitg.x; j < half_; j += ntg.x) { + float timestep = ((device float *)src0)[i]; + float freq = (float)exp(-log((float)max_period) * j / half_); + float arg = timestep * freq; + embed_data[j ] = cos(arg); + embed_data[j + half_] = sin(arg); + } + + if (dim % 2 != 0 && tpitg.x == 0) { + embed_data[dim] = 0.f; + } +} + // bitonic sort implementation following the CUDA kernels as reference typedef void (argsort_t)( device const float * x, diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 4dc348642..d5868974c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2240,7 +2240,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg GGML_UNUSED(backend); } -static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; switch (node->op) { @@ -2255,7 +2255,7 @@ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgrap } } - return true; + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cad08d610..47a605b01 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15581,7 +15581,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context 
*)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); @@ -15613,7 +15613,7 @@ GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, gg GGML_ASSERT(ok); } - return true; + return GGML_STATUS_SUCCESS; } GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index ae9cb3c1c..bc316c3f3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5092,7 +5092,7 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { ctx->transfer_ctx = nullptr; } -GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -5135,7 +5135,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml ggml_vk_graph_cleanup(ctx); - return true; + return GGML_STATUS_SUCCESS; UNUSED(backend); } diff --git a/ggml.c b/ggml.c index b7ecb1de9..eeb1bb6a0 100644 --- a/ggml.c +++ b/ggml.c @@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; +const char * ggml_status_to_string(enum ggml_status status) { + switch (status) { + case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; + case GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; + case GGML_STATUS_SUCCESS: return "GGML status: success"; + case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)"; + } + + return "GGML status: unknown"; +} + // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { @@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "POOL_2D", "UPSCALE", "PAD", + "ARANGE", + "TIMESTEP_EMBEDDING", "ARGSORT", "LEAKY_RELU", @@ -1850,7 +1863,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1908,6 +1921,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "pool_2d(x)", "upscale(x)", "pad(x)", + "arange(start, stop, step)", + "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", "leaky_relu(x)", @@ -1936,7 +1951,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -2896,11 +2911,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_ return ((const int32_t *)(tensor->op_params))[i]; } +static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); + return ((const float *)(tensor->op_params))[i]; +} + static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); ((int32_t *)(tensor->op_params))[i] = value; } +static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) { + assert(i < 
GGML_MAX_OP_PARAMS / sizeof(float)); + ((float *)(tensor->op_params))[i] = value; +} + struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { memset(tensor->data, 0, ggml_nbytes(tensor)); return tensor; @@ -5899,6 +5924,55 @@ struct ggml_tensor * ggml_upscale( return ggml_upscale_impl(ctx, a, scale_factor); } +struct ggml_tensor * ggml_arange( + struct ggml_context * ctx, + float start, + float stop, + float step) { + + GGML_ASSERT(stop > start); + + const int64_t steps = (int64_t) ceilf((stop - start) / step); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps); + + result->op = GGML_OP_ARANGE; + ggml_set_op_params_f32(result, 0, start); + ggml_set_op_params_f32(result, 1, stop); + ggml_set_op_params_f32(result, 2, step); + + return result; +} + +struct ggml_tensor * ggml_timestep_embedding( + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period) { + bool is_node = false; + + if (timesteps->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + int actual_dim = dim; + if (dim % 2 != 0) { + actual_dim = dim + 1; + } + + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]); + + result->op = GGML_OP_TIMESTEP_EMBEDDING; + ggml_set_op_params_i32(result, 0, dim); + ggml_set_op_params_i32(result, 1, max_period); + + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = timesteps; + + return result; +} + // ggml_argsort struct ggml_tensor * ggml_argsort( @@ -10232,7 +10306,7 @@ static void ggml_compute_forward_group_norm_f32( int n_channels = src0->ne[2]; int n_groups = dst->op_params[0]; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; - for (int i = ith; i < n_groups; i+=nth) { + for (int i = ith; i < n_groups; i += nth) { int start = i * n_channels_per_group; int end = start + n_channels_per_group; if (end > n_channels) { @@ -10246,28 +10320,32 @@ static void ggml_compute_forward_group_norm_f32( for (int64_t i01 = 0; i01 < ne01; i01++) { const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + ggml_float sumr = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; + sumr += (ggml_float)x[i00]; } + sum += sumr; } } - float mean = sum / (ne00 * ne01 * step); - ggml_float sum2 = 0.0; + const float mean = sum / (ne00 * ne01 * step); + ggml_float sum2 = 0.0; for (int64_t i02 = start; i02 < end; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_float sumr = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { float v = x[i00] - mean; y[i00] = v; - sum2 += (ggml_float)(v * v); + sumr += (ggml_float)(v * v); } + sum2 += sumr; } } - float variance = sum2 / (ne00 * ne01 * step); + const float variance = sum2 / (ne00 * ne01 * step); const float scale = 1.0f / sqrtf(variance + eps); for (int64_t i02 = start; i02 < end; i02++) { @@ -13548,6 +13626,106 @@ static void ggml_compute_forward_pad( } } + +// ggml_compute_forward_arange + +static void ggml_compute_forward_arange_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const float start = 
ggml_get_op_params_f32(dst, 0); + const float stop = ggml_get_op_params_f32(dst, 1); + const float step = ggml_get_op_params_f32(dst, 2); + + const int64_t steps = (int64_t) ceilf((stop - start) / step); + + GGML_ASSERT(ggml_nelements(dst) == steps); + + for (int64_t i = ith; i < steps; i+= nth) { + float value = start + step * i; + ((float *)dst->data)[i] = value; + } +} + +static void ggml_compute_forward_arange( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_arange_f32(params, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_timestep_embedding_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const int dim = ggml_get_op_params_i32(dst, 0); + const int max_period = ggml_get_op_params_i32(dst, 1); + + int half = dim / 2; + + for (int64_t i = 0; i < ne00; i++) { + float * embed_data = (float *)((char *) dst->data + i*nb1); + for (int64_t j = ith; j < half; j += nth) { + float timestep = ((float *)src0->data)[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); + } + if (dim % 2 != 0 && ith == 0) { + embed_data[dim] = 0.f; + } + } +} + +static void ggml_compute_forward_timestep_embedding( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_timestep_embedding_f32(params, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_argsort static void ggml_compute_forward_argsort_f32( @@ -15616,6 +15794,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad(params, tensor); } break; + case GGML_OP_ARANGE: + { + ggml_compute_forward_arange(params, tensor); + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + ggml_compute_forward_timestep_embedding(params, tensor); + } break; case GGML_OP_ARGSORT: { ggml_compute_forward_argsort(params, tensor); @@ -16618,6 +16804,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_ARANGE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_ARGSORT: { GGML_ASSERT(false); // TODO: not implemented @@ -17218,6 +17412,7 @@ struct ggml_compute_state { ggml_thread_t thrd; int ith; struct ggml_compute_state_shared * shared; + enum ggml_status ec; }; static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { @@ -17369,6 +17564,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_ARANGE: + { + n_tasks = n_threads; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + n_tasks = n_threads; + } break; case GGML_OP_ARGSORT: { n_tasks = n_threads; @@ -17503,7 +17706,8 @@ static thread_ret_t 
ggml_graph_compute_thread(void * data) { while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { state->shared->node_n += 1; - return (thread_ret_t) GGML_EXIT_ABORTED; + state->ec = GGML_STATUS_ABORTED; + return 0; } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { @@ -17625,7 +17829,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - return GGML_EXIT_SUCCESS; + return 0; } struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { @@ -17821,7 +18025,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } -int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -17865,6 +18069,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { .thrd = 0, .ith = j, .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, }; const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); @@ -17875,12 +18080,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { workers[0].ith = 0; workers[0].shared = &state_shared; + workers[0].ec = GGML_STATUS_SUCCESS; const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + ggml_graph_compute_thread(&workers[0]); + enum ggml_status compute_status = workers[0].ec; // don't leave affinity set on the main thread clear_numa_thread_affinity(); @@ -17890,6 +18097,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); GGML_ASSERT(rc == 0); + if (workers[j].ec != GGML_STATUS_SUCCESS) + compute_status = workers[j].ec; } } @@ -17917,14 +18126,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { return compute_status; } -void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { +enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - ggml_graph_compute(cgraph, &cplan); + return ggml_graph_compute(cgraph, &cplan); } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { diff --git a/ggml.h b/ggml.h index 8009986b7..1105a823a 100644 --- a/ggml.h +++ b/ggml.h @@ -322,6 +322,16 @@ extern "C" { #endif + enum ggml_status { + GGML_STATUS_ALLOC_FAILED = -2, + GGML_STATUS_FAILED = -1, + GGML_STATUS_SUCCESS = 0, + GGML_STATUS_ABORTED = 1, + }; + + // get ggml_status name string + GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); + typedef uint16_t ggml_fp16_t; // convert FP16 <-> FP32 @@ -461,6 +471,8 @@ extern "C" { GGML_OP_POOL_2D, GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, + GGML_OP_ARANGE, + GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, GGML_OP_LEAKY_RELU, @@ -1668,6 +1680,15 @@ extern "C" { int p2, int p3); + // Ref: 
https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 + // timesteps: [N,] + // return: [N, dim] + GGML_API struct ggml_tensor * ggml_timestep_embedding( + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period); + // sort rows enum ggml_sort_order { GGML_SORT_ORDER_ASC, @@ -1679,6 +1700,12 @@ extern "C" { struct ggml_tensor * a, enum ggml_sort_order order); + GGML_API struct ggml_tensor * ggml_arange( + struct ggml_context * ctx, + float start, + float stop, + float step); + // top k elements per row GGML_API struct ggml_tensor * ggml_top_k( struct ggml_context * ctx, @@ -1930,12 +1957,11 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); - + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data - GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/koboldcpp.py b/koboldcpp.py index 7e3577779..1acf71355 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -570,7 +570,7 @@ maxhordelen = 256 modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.60" +KcppVersion = "1.60.1" showdebug = True showsamplerwarning = True showmaxctxwarning = True diff --git a/llama.cpp b/llama.cpp index 6e36a206e..2bbe7ef93 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1693,7 +1693,7 @@ struct llama_hparams { }; struct llama_cparams { - uint32_t n_ctx; // context size used during inference + uint32_t n_ctx; // context size used during inference uint32_t n_batch; uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -1710,7 +1710,9 @@ struct llama_cparams { float yarn_beta_slow; float defrag_thold; + bool embeddings; bool offload_kqv; + enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; @@ -2004,7 +2006,7 @@ struct llama_context { int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) int32_t n_eval = 0; // number of eval calls - // decode output (2-dimensional array: [n_tokens][n_vocab]) + // logits output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; #ifndef NDEBUG // guard against access to unset logits @@ -2012,8 +2014,13 @@ struct llama_context { #endif bool logits_all = false; - // input embedding (1-dimensional array: [n_embd]) - std::vector embedding; + // embeddings output (2-dimensional array: [n_tokens][n_embd]) + // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE + std::vector embd; + + // sequence embeddings output (map of [n_embd] 
vectors) + // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE + std::map> embd_seq; // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -5164,6 +5171,7 @@ static struct ggml_tensor * llm_build_kv( llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; + cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -6157,6 +6165,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6164,9 +6173,10 @@ struct llm_build_context { // get input vectors with right size const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); - struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); + struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); @@ -6184,39 +6194,38 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens] + struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0)); + cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens] // iterate layers for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur = inpL; + struct ggml_tensor * Qcur; + struct ggml_tensor * Kcur; + struct ggml_tensor * Vcur; + // self-attention if (model.arch == LLM_ARCH_BERT) { - struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); - // seems like we just need to do this for Q? 
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -6235,13 +6244,41 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); + + kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + if (model.layers[il].bo) { + cb(cur, "kqv_wo", il); + } + + if (model.layers[il].bo) { + cur = ggml_add(ctx0, cur, model.layers[il].bo); + } + cb(cur, "kqv_out", il); + // re-add the layer input cur = ggml_add(ctx0, cur, inpL); @@ -6281,16 +6318,29 @@ struct llm_build_context { // final output cur = inpL; + cb(cur, "result_embd", -1); // pooling layer - if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { - cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { - cur = ggml_get_rows(ctx0, cur, inp_cls); - } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); + switch (pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // nop + } break; + case LLAMA_POOLING_TYPE_MEAN: + { + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); + 
cb(cur, "result_embd_pooled", -1); + } break; + case LLAMA_POOLING_TYPE_CLS: + { + cur = ggml_get_rows(ctx0, cur, inp_cls); + cb(cur, "result_embd_pooled", -1); + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ASSERT(false && "Invalid pooling type"); + } break; } - cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); @@ -8052,7 +8102,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); } - { + if (hparams.causal_attn) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = batch.n_tokens; @@ -8067,16 +8117,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int i = 0; i < n_kv; ++i) { float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || - (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { f = -INFINITY; } else { - f = 0; + f = 0.0f; } data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } } + } else { + // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used) + const int64_t n_tokens = batch.n_tokens; + + assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + + float * data = (float *) lctx.inp_KQ_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_tokens; ++i) { + float f = -INFINITY; + for (int s = 0; s < batch.n_seq_id[i]; ++s) { + if (batch.seq_id[i][s] == seq_id) { + f = 0.0f; + break; + } + } + + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f; + } + } + } } if (hparams.need_kq_pos) { @@ -8095,13 +8169,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; + float * data = (float *) lctx.inp_mean->data; memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); std::vector sum(n_tokens, 0); for (int i = 0; i < n_tokens; ++i) { const llama_seq_id seq_id = batch.seq_id[i][0]; + + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + sum[seq_id] += 1; } @@ -8123,11 +8200,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + uint32_t * data = (uint32_t *) lctx.inp_cls->data; + memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); for (int i = 0; i < n_tokens; ++i) { const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; + const llama_pos pos = batch.pos[i]; + + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); + if (pos == 0) { data[seq_id] = i; } @@ -8241,24 +8323,27 @@ static int llama_decode_internal( batch.seq_id = seq_id_arr.data(); } - llama_kv_cache_update(&lctx); + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_cache_update(&lctx); - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; + // if we have enough unused cells before the current head -> + // 
better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*n_tokens) { + kv_self.head = 0; + } + + if (!llama_kv_cache_find_slot(kv_self, batch)) { + return 1; + } + + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); } - if (!llama_kv_cache_find_slot(kv_self, batch)) { - return 1; - } - - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched); @@ -8267,20 +8352,26 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; - if (strcmp(res->name, "result_output") == 0) { - // the embeddings could be the second to last tensor, or the third to last tensor - if (strcmp(embeddings->name, "result_norm") != 0) { - embeddings = gf->nodes[gf->n_nodes - 3]; - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - } - } else if (strcmp(res->name, "result_embd") == 0) { - embeddings = res; - res = nullptr; + if (!hparams.causal_attn) { + res = nullptr; // do not extract logits for embedding models such as BERT + + // token or sequence embeddings + embd = gf->nodes[gf->n_nodes - 1]; + + GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); } else { - GGML_ASSERT(false); + if (strcmp(res->name, "result_output") == 0) { + // the token embeddings could be the second to last tensor, or the third to last tensor + if (strcmp(embd->name, "result_norm") != 0) { + embd = gf->nodes[gf->n_nodes - 3]; + GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + } + } else { + GGML_ASSERT(false && "missing result_output tensor"); + } } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -8347,46 +8438,82 @@ static int llama_decode_internal( logits_out.clear(); #endif - ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res); - GGML_ASSERT(res_backend != nullptr); + ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res); + GGML_ASSERT(backend_res != nullptr); + if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { if (batch.logits[i] == 0) { continue; } - ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), 
n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif } - ggml_backend_synchronize(res_backend); + ggml_backend_synchronize(backend_res); } // extract embeddings - if (!lctx.embedding.empty()) { - auto & embedding_out = lctx.embedding; + if (cparams.embeddings && embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd); + GGML_ASSERT(backend_embd != nullptr); - const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; - const int64_t embd_size = res ? n_embd : n_embd * n_tokens; + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + auto & embd_out = lctx.embd; - embedding_out.resize(embd_size); - ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); - ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); - ggml_backend_synchronize(embeddings_backend); + if (batch.logits) { + embd_out.resize(n_embd * n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + + ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float)); + } + } + } break; + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_MEAN: + { + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0); + + // extract sequence embeddings + auto & embd_seq_out = lctx.embd_seq; + embd_seq_out.clear(); + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ASSERT(false && "unknown pooling type"); + } break; + } + ggml_backend_synchronize(backend_embd); } // measure the performance only for the single-token evals @@ -12181,7 +12308,7 @@ struct llama_context_params llama_context_default_params() { /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, /*.logits_all =*/ false, - /*.embedding =*/ false, + /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, @@ -12322,6 +12449,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.pooling_type = params.pooling_type; @@ -12499,8 +12627,8 @@ struct llama_context * llama_new_context_with_model( // resized during 
inference, reserve maximum ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); - if (params.embedding) { - ctx->embedding.resize(hparams.n_embd); + if (params.embeddings) { + ctx->embd.reserve(hparams.n_embd*cparams.n_batch); } // graph inputs @@ -12935,7 +13063,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { // assume worst case for logits although only currently set ones are serialized const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embedding.size() * sizeof(float); + const size_t s_embedding = ctx->embd.capacity() * sizeof(float); const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t); @@ -13044,12 +13172,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat // copy embeddings { - const size_t embedding_size = ctx->embedding.size(); + const size_t embeddings_size = ctx->embd.size(); - data_ctx->write(&embedding_size, sizeof(embedding_size)); + data_ctx->write(&embeddings_size, sizeof(embeddings_size)); - if (embedding_size) { - data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float)); + if (embeddings_size) { + data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float)); } } @@ -13153,15 +13281,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { // set embeddings { - size_t embedding_size; + size_t embeddings_size; - memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); + memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - GGML_ASSERT(ctx->embedding.capacity() == embedding_size); + GGML_ASSERT(ctx->embd.capacity() == embeddings_size); - if (embedding_size) { - memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); - inp += embedding_size * sizeof(float); + if (embeddings_size) { + ctx->embd.resize(embeddings_size); + + memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float)); + inp += embeddings_size * sizeof(float); } } @@ -13421,11 +13551,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { } float * llama_get_embeddings(struct llama_context * ctx) { - return ctx->embedding.data(); + return ctx->embd.data(); } float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { - return ctx->embedding.data() + i*ctx->model.hparams.n_embd; + return ctx->embd.data() + i*ctx->model.hparams.n_embd; +} + +float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { + auto it = ctx->embd_seq.find(seq_id); + if (it == ctx->embd_seq.end()) { + return nullptr; + } + + return it->second.data(); } const char * llama_token_get_text(const struct llama_model * model, llama_token token) { @@ -13604,7 +13743,7 @@ static int32_t llama_chat_apply_template_internal( std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 std::stringstream ss; - if (tmpl.find("<|im_start|>") != std::string::npos) { + if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) { // chatml template for (auto message : chat) { ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; @@ -13612,7 +13751,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl.find("[INST]") != std::string::npos) { + } else if (tmpl == "llama2" || tmpl.find("[INST]") 
!= std::string::npos) { // llama2 template and its variants // [variant] support system message bool support_system_message = tmpl.find("<>") != std::string::npos; @@ -13647,7 +13786,7 @@ static int32_t llama_chat_apply_template_internal( } } // llama2 templates seem to not care about "add_generation_prompt" - } else if (tmpl.find("<|user|>") != std::string::npos) { + } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { // zephyr template for (auto message : chat) { ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; @@ -13655,7 +13794,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl.find("bos_token + message['role']") != std::string::npos) { + } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) { // mlabonne/AlphaMonarch-7B template (the is included inside history) for (auto message : chat) { std::string bos = (message == chat.front()) ? "" : ""; // skip BOS for first message @@ -13664,7 +13803,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl.find("") != std::string::npos) { + } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { @@ -13711,7 +13850,7 @@ LLAMA_API int32_t llama_chat_apply_template( int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); if (res < 0) { // worst case: there is no information about template, we will use chatml by default - curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal + curr_tmpl = "chatml"; // see llama_chat_apply_template_internal } else { curr_tmpl = std::string(model_template.data(), model_template.size()); } diff --git a/llama.h b/llama.h index 65c2f7835..97f58bb0e 100644 --- a/llama.h +++ b/llama.h @@ -163,7 +163,7 @@ extern "C" { // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - pos : the positions of the respective token in the sequence // - seq_id : the sequence to which the respective token belongs - // - logits : if zero, the logits for the respective token will not be output + // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output // typedef struct llama_batch { int32_t n_tokens; @@ -173,7 +173,7 @@ extern "C" { llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; - int8_t * logits; + int8_t * logits; // TODO: rename this to "output" // NOTE: helpers for smooth API transition - can be deprecated in the future // for future-proof code, use the above fields instead and ignore everything below @@ -260,7 +260,7 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. 
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embedding; // embedding mode only + bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU // Abort callback @@ -657,14 +657,20 @@ extern "C" { // llama_get_logits(ctx) + i*n_vocab LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); - // Get the embeddings for the input - // shape: [n_embd] (1-dimensional) + // Get all output token embeddings + // shape: [n_tokens*n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); - // Get the embeddings for the ith sequence + // Get the embeddings for the ith token // llama_get_embeddings(ctx) + i*n_embd + // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); + // Get the embeddings for a sequence id + // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE + // shape: [n_embd] (1-dimensional) + LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); + // // Vocab // diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index 580ec3e92..71f91fe58 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -846,7 +846,7 @@ public: return NULL; } // it's performing a compute, check if backend isn't cpu - if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) { + if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) { // pass input tensors to gpu memory auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
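
A minimal CPU-side sketch (not part of the patch above) showing how the new ggml_arange and ggml_timestep_embedding ops and the enum ggml_status return value introduced in this diff could be exercised. The buffer size, thread count, and the printed element are illustrative assumptions, not values taken from the patch.

// sketch only: assumes the ggml.h declarations added in this diff
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,  // assumption: 16 MB scratch is enough for this toy graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // arange(0, 10, 1) -> 10 timesteps, then a [dim=32, N=10] timestep embedding
    struct ggml_tensor * ts  = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, ts, 32, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);

    // ggml_graph_compute_with_ctx() now reports an enum ggml_status instead of void
    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, gf, 4);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "%s\n", ggml_status_to_string(st));
        ggml_free(ctx);
        return 1;
    }

    printf("first embedding value: %f\n", ggml_get_f32_1d(emb, 0));

    ggml_free(ctx);
    return 0;
}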
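A similarly hedged sketch of the new pooled-embedding getter, llama_get_embeddings_seq(). It assumes a context created with the renamed embeddings flag enabled and a pooling_type other than LLAMA_POOLING_TYPE_NONE, and that a batch has already been decoded; the helper name print_seq_embeddings and the loop bound n_seq are hypothetical.

// sketch only: relies on the llama.h declarations added in this diff
#include <stdio.h>
#include "llama.h"

void print_seq_embeddings(struct llama_context * ctx, const struct llama_model * model, int n_seq) {
    const int n_embd = llama_n_embd(model);

    for (llama_seq_id s = 0; s < n_seq; ++s) {
        // returns NULL when pooling_type == LLAMA_POOLING_TYPE_NONE
        const float * e = llama_get_embeddings_seq(ctx, s);
        if (e == NULL) {
            continue;
        }
        printf("seq %d: n_embd = %d, e[0] = %f\n", (int) s, n_embd, e[0]);
    }
}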