Merge commit '017cc5f446' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/019-bug-misc.yml
#	CODEOWNERS
#	examples/batched-bench/batched-bench.cpp
#	examples/batched/batched.cpp
#	examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
#	examples/gritlm/gritlm.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/passkey/passkey.cpp
#	examples/quantize-stats/quantize-stats.cpp
#	examples/run/run.cpp
#	examples/simple-chat/simple-chat.cpp
#	examples/simple/simple.cpp
#	examples/tokenize/tokenize.cpp
#	ggml/CMakeLists.txt
#	ggml/src/ggml-metal/CMakeLists.txt
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-autorelease.cpp
#	tests/test-model-load-cancel.cpp
#	tests/test-tokenizer-0.cpp
#	tests/test-tokenizer-1-bpe.cpp
#	tests/test-tokenizer-1-spm.cpp
This commit is contained in:
Concedo 2025-01-08 23:15:21 +08:00
commit dcfa1eca4e
45 changed files with 806 additions and 229 deletions

View file

@ -23,12 +23,8 @@
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
@ -36,7 +32,6 @@
#include <ctime>
#include <functional>
#include <initializer_list>
#include <locale>
#include <map>
#include <numeric>
#include <type_traits>
@ -1655,6 +1650,32 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_COHERE2:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
// init output from the input tok embed
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
llama_model_loader::TENSOR_DUPLICATED);
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
}
}
break;
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -1934,6 +1955,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} else {
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
if (n_expert == 0) {
throw std::runtime_error("n_expert must be > 0");
@ -2617,21 +2639,21 @@ static struct ggml_tensor * llm_build_inp_embd(
struct ggml_context * ctx,
struct llama_context & lctx,
const llama_hparams & hparams,
const llama_ubatch & batch,
const llama_ubatch & ubatch,
struct ggml_tensor * tok_embd,
const llm_build_cb & cb) {
const int64_t n_embd = hparams.n_embd;
struct ggml_tensor * inpL;
if (batch.token) {
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
if (ubatch.token) {
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
cb(lctx.inp_tokens, "inp_tokens", -1);
ggml_set_input(lctx.inp_tokens);
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
ggml_set_input(lctx.inp_embd);
}
@ -2915,12 +2937,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
struct ggml_tensor * up_exps,
struct ggml_tensor * gate_exps,
struct ggml_tensor * down_exps,
struct ggml_tensor * exp_probs_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
const llm_build_cb & cb,
int il) {
int64_t n_embd = cur->ne[0];
@ -2929,11 +2953,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
cb(logits, "ffn_moe_logits", il);
ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
ggml_tensor * probs = nullptr;
switch (gating_op) {
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
{
probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
} break;
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
{
probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
} break;
default:
GGML_ABORT("fatal error");
}
cb(probs, "ffn_moe_probs", il);
// add experts selection bias - introduced in DeepSeek V3
// leave probs unbiased as it's later used to get expert weights
ggml_tensor * selection_probs = probs;
if (exp_probs_b != nullptr) {
selection_probs = ggml_add(ctx, probs, exp_probs_b);
cb(selection_probs, "ffn_moe_probs_biased", il);
}
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il);
@ -3218,7 +3262,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
static struct ggml_tensor * llm_build_mamba(
struct ggml_context * ctx,
struct llama_context & lctx,
const llama_ubatch & batch,
const llama_ubatch & ubatch,
struct ggml_cgraph * graph,
struct ggml_tensor * cur,
struct ggml_tensor * state_copy,
@ -3234,17 +3278,17 @@ static struct ggml_tensor * llm_build_mamba(
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
const int64_t dt_rank = hparams.ssm_dt_rank;
const int64_t n_seqs = batch.n_seqs;
const int64_t n_seqs = ubatch.n_seqs;
// Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
// Use the same RMS norm as the final layer norm
const float norm_rms_eps = hparams.f_norm_rms_eps;
const int64_t n_seq_tokens = batch.n_seq_tokens;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(batch.equal_seqs);
GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(ubatch.equal_seqs);
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
struct ggml_tensor * conv_states_all = kv.k_l[il];
struct ggml_tensor * ssm_states_all = kv.v_l[il];
@ -4072,9 +4116,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
}
@ -4724,9 +4770,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@ -4865,9 +4913,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@ -6113,9 +6163,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@ -7755,6 +7807,137 @@ struct llm_build_context {
}
struct ggml_cgraph * build_cohere2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
const float f_logit_scale = hparams.f_logit_scale;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
// cohere2 requires different mask for layers using sliding window (SWA)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
// sliding window switch pattern
const int32_t sliding_window_pattern = 4;
for (int il = 0; il < n_layer; ++il) {
// three layers sliding window attention (window size 4096) and ROPE
// fourth layer uses global attention without positional embeddings
const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
// norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
cb(cur, "attn_norm", il);
struct ggml_tensor * ffn_inp = cur;
// self-attention
{
// rope freq factors for 128k context
struct ggml_tensor * rope_factors = build_rope_factors(il);
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
if (is_sliding) {
Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
} else {
// For non-sliding layers, just reshape without applying RoPE
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
cb(Kcur, "Kcur", il);
}
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
}
struct ggml_tensor * attn_out = cur;
// feed-forward network
{
cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
cb, il);
cb(cur, "ffn_out", il);
}
// add together residual + FFN + self-attention
cur = ggml_add(ctx0, cur, inpL);
cur = ggml_add(ctx0, cur, attn_out);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
}
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
// ref: https://allenai.org/olmo
// based on the original build_llama() function, changes:
// * non-parametric layer norm
@ -8107,9 +8290,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@ -8504,9 +8689,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@ -8645,9 +8832,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
cb, il);
cb(moe_out, "ffn_moe_out", il);
@ -8874,9 +9063,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(enum llama_expert_gating_func_type) hparams.expert_gating_func,
cb, il);
cb(moe_out, "ffn_moe_out", il);
@ -10506,6 +10697,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_command_r();
} break;
case LLM_ARCH_COHERE2:
{
result = llm.build_cohere2();
} break;
case LLM_ARCH_DBRX:
{
result = llm.build_dbrx();
@ -10639,7 +10834,7 @@ static enum ggml_status llama_graph_compute(
// return positive int on warning
// return negative int on error
//
static int llama_decode_internal(
static int llama_decode_impl(
llama_context & lctx,
llama_batch inp_batch) {
@ -10974,7 +11169,7 @@ static int llama_decode_internal(
// return positive int on warning
// return negative int on error
//
static int llama_encode_internal(
static int llama_encode_impl(
llama_context & lctx,
llama_batch inp_batch) {
@ -11156,7 +11351,7 @@ static int llama_encode_internal(
}
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
auto & kv_self = lctx.kv_self;
const auto & hparams = lctx.model.hparams;
@ -11376,7 +11571,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
}
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
static void llama_kv_cache_update_impl(struct llama_context & lctx) {
bool need_reserve = false;
if (lctx.kv_self.has_shift) {
@ -11412,7 +11607,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
// defragment the KV cache if needed
if (lctx.kv_self.do_defrag) {
llama_kv_cache_defrag_internal(lctx);
llama_kv_cache_defrag_impl(lctx);
need_reserve = true;
@ -11441,13 +11636,7 @@ int32_t llama_lora_adapter_set(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
float scale) {
if (ctx->cparams.flash_attn) {
LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
return -1;
}
ctx->lora_adapters[adapter] = scale;
return 0;
}
@ -11582,6 +11771,12 @@ int64_t llama_time_us(void) {
struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params) {
return llama_model_load_from_file(path_model, params);
}
struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params) {
ggml_time_init();
llama_model * model = new llama_model;
@ -11620,7 +11815,7 @@ struct llama_model * llama_load_model_from_file(
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
llama_free_model(model);
llama_model_free(model);
return nullptr;
}
@ -11628,7 +11823,7 @@ struct llama_model * llama_load_model_from_file(
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
llama_free_model(model);
llama_model_free(model);
return nullptr;
}
@ -11638,7 +11833,7 @@ struct llama_model * llama_load_model_from_file(
model->devices.push_back(dev);
} else {
LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
llama_free_model(model);
llama_model_free(model);
return nullptr;
}
}
@ -11670,7 +11865,7 @@ struct llama_model * llama_load_model_from_file(
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
llama_free_model(model);
llama_model_free(model);
return nullptr;
}
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
@ -11693,7 +11888,7 @@ struct llama_model * llama_load_model_from_file(
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
llama_free_model(model);
llama_model_free(model);
return nullptr;
}
@ -12117,7 +12312,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) {
}
void llama_kv_cache_update(struct llama_context * ctx) {
llama_kv_cache_update_internal(*ctx);
llama_kv_cache_update_impl(*ctx);
}
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
@ -12129,7 +12324,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_encode_internal(*ctx, batch);
const int ret = llama_encode_impl(*ctx, batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
}
@ -12140,7 +12335,7 @@ int32_t llama_encode(
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_decode_internal(*ctx, batch);
const int ret = llama_decode_impl(*ctx, batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}
@ -12360,16 +12555,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
return 0;
}
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
std::string str_split_path(split_path);
char postfix[32];
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);
// check if dest ends with postfix
// check if split_prefix ends with postfix
int size_prefix = str_split_path.size() - str_postfix.size();
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
return size_prefix;
}
@ -12378,6 +12573,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
const char * llama_print_system_info(void) {
static std::string s;
s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
auto * reg = ggml_backend_reg_get(i);