Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
Merge branch 'master' into concedo_experimental

# Conflicts:
#	ggml.c
#	ggml.h
#	requirements.txt
#	tests/test-quantize-perf.cpp
Commit 76a3ba42eb
17 changed files with 414 additions and 163 deletions
llama.cpp | 36 (only this file's hunks are shown below)

@@ -1515,6 +1515,10 @@ struct llama_context {
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
+#ifndef NDEBUG
+    // guard against access to unset logits
+    std::vector<bool> logits_valid;
+#endif
     bool logits_all = false;
 
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
@@ -1565,7 +1569,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
@@ -3852,8 +3856,8 @@ static void llm_build_k_shift(
             ggml_rope_custom_inplace(ctx,
                     ggml_view_3d(ctx, kv.k_l[il],
                         n_embd_head, n_head_kv, n_ctx,
-                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
-                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                        ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                        ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
                         0),
                     K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
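Editor's note: this view addresses the K cache as [n_embd_head, n_head_kv, n_ctx], so nb1 (now ggml_row_size(type, n_embd_head)) steps between heads and nb2 (now ggml_row_size(type, n_embd_gqa)) steps between cache cells; the llm_build_kqv hunk further down uses the same two strides with their roles swapped because its view shape is [n_embd_head, n_kv, n_head_kv]. A hedged sketch of the resulting address arithmetic, assuming an f16 cache at 2 bytes per element (quantized caches would use the block-based row size instead):

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper mirroring ggml_row_size for an unquantized f16 cache.
    static size_t row_size_f16(int64_t ne) { return (size_t) ne * 2; }

    // Byte offset of element (i0, i1, i2) in the k-shift view of shape
    // [n_embd_head, n_head_kv, n_ctx]: nb1 steps between heads, nb2 between cells.
    static size_t k_view_offset(int64_t i0, int64_t i1, int64_t i2,
                                int64_t n_embd_head, int64_t n_embd_gqa) {
        const size_t nb1 = row_size_f16(n_embd_head); // one head's worth of K
        const size_t nb2 = row_size_f16(n_embd_gqa);  // one cache cell (all heads)
        return (size_t) i0 * 2 + (size_t) i1 * nb1 + (size_t) i2 * nb2;
    }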
@@ -3882,7 +3886,7 @@ static void llm_build_kv_store(
     cb(v_cur_t, "v_cur_t", il);
 
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-            (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
+            (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
     struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -4041,8 +4045,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head),
                 0);
     cb(k, "k", il);
 
@@ -6182,6 +6186,14 @@ static int llama_decode_internal(
     {
         auto & logits_out = lctx.logits;
 
+#ifndef NDEBUG
+        auto & logits_valid = lctx.logits_valid;
+        logits_valid.clear();
+        logits_valid.resize(n_tokens);
+
+        logits_out.clear();
+#endif
+
         if (batch.logits) {
             logits_out.resize(n_vocab * n_tokens);
             for (uint32_t i = 0; i < n_tokens; i++) {
@@ -6189,13 +6201,22 @@ static int llama_decode_internal(
                     continue;
                 }
                 memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+#ifndef NDEBUG
+                logits_valid[i] = true;
+#endif
            }
        } else if (lctx.logits_all) {
            logits_out.resize(n_vocab * n_tokens);
            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
+#ifndef NDEBUG
+            std::fill(logits_valid.begin(), logits_valid.end(), true);
+#endif
        } else {
            logits_out.resize(n_vocab);
            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
+#ifndef NDEBUG
+            logits_valid[n_tokens - 1] = true;
+#endif
        }
    }
 
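Editor's note: the three branches mirror the three ways logits can be requested (per-token via batch.logits, all tokens via logits_all, or only the last token), and each marks exactly the positions it copied. Reduced to its essentials, the debug-only guard pattern looks like the following sketch (hypothetical Ctx type; both vectors are assumed pre-sized by the caller, as the resize calls above do):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Ctx {
        std::vector<float> logits;
    #ifndef NDEBUG
        std::vector<bool> logits_valid;   // one flag per token position
    #endif
    };

    // Copy one token's logits into the output buffer and mark that position
    // valid; in release builds (NDEBUG) the bookkeeping compiles away.
    static void store_logits(Ctx & ctx, size_t i, const float * src, size_t n_vocab) {
        std::copy(src, src + n_vocab, ctx.logits.begin() + i * n_vocab);
    #ifndef NDEBUG
        ctx.logits_valid[i] = true;       // only this position may be read back
    #endif
    }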
@@ -8734,7 +8755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
         // quantize only 2D tensors
-        quantize &= (tensor->n_dims == 2);
+        quantize &= (ggml_n_dims(tensor) == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;
 
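Editor's note: this hunk tracks the upstream removal of the explicit n_dims field from ggml_tensor; ggml_n_dims derives the effective rank from the shape instead. A standalone sketch of what that helper computes (kMaxDims mirrors GGML_MAX_DIMS, assumed to be 4 here):

    #include <cstdint>

    static const int kMaxDims = 4;  // stands in for GGML_MAX_DIMS (assumed)

    // Effective rank: the highest axis with extent > 1 determines it;
    // trailing size-1 axes do not count.
    static int n_dims(const int64_t ne[/*kMaxDims*/]) {
        for (int i = kMaxDims - 1; i >= 1; --i) {
            if (ne[i] > 1) {
                return i + 1;
            }
        }
        return 1;   // scalars and vectors both report rank 1
    }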
@@ -10328,6 +10349,7 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    assert(ctx->logits_valid.at(i));
     return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
 }
 
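Editor's note: because assert() expands to nothing when NDEBUG is defined, the bounds-checked .at(i) call is only evaluated in debug builds, which is also why the logits_valid member itself can live behind #ifndef NDEBUG without breaking release builds. A minimal demonstration:

    #include <cassert>
    #include <vector>

    int main() {
        std::vector<bool> valid = {true, false};
        assert(valid.at(0));    // debug builds: bounds check plus validity check
        // assert(valid.at(1)); // would abort a debug build here
        // With -DNDEBUG both asserts expand to nothing, so .at() is never called.
        return 0;
    }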