mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	examples/embedding/embedding.cpp
#	tools/imatrix/imatrix.cpp
#	tools/perplexity/perplexity.cpp
commit 2439014a03
40 changed files with 2058 additions and 429 deletions
src/llama-context.cpp

@@ -116,8 +116,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    logits_all = params.logits_all;
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
@@ -253,7 +251,7 @@ llama_context::llama_context(
     }
 
     // reserve worst-case graph
-    if (!hparams.vocab_only) {
+    if (!hparams.vocab_only && memory) {
        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
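Note on the added && memory guard: a context that owns no memory module (no KV cache, e.g. an encoder-only embedding model) now skips the worst-case decode-graph reservation entirely. A minimal sketch of creating such an embeddings-only context through the public C API; the mean-pooling choice is a placeholder, not something this commit prescribes:

    // sketch: embeddings-only context; for encoder-only models no KV cache
    // ("memory") is created, so the reservation above is skipped
    #include "llama.h"

    llama_context * make_embd_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // extract embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;  // placeholder choice
        return llama_init_from_model(model, cparams);    // nullptr on failure
    }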
@@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) {
         t_compute_start_us = ggml_time_us();
     }
 
+    embd_seq.clear();
+
     n_queued_tokens += n_tokens;
 
     const int64_t n_embd = hparams.n_embd;
@@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
         ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
         GGML_ASSERT(backend_embd != nullptr);
 
-        GGML_ASSERT(embd != nullptr);
-
         switch (cparams.pooling_type) {
             case LLAMA_POOLING_TYPE_NONE:
                 {
                     // extract token embeddings
+                    GGML_ASSERT(embd != nullptr);
+
                     GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
                     ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
                 } break;
@@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) {
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
                 {
-                    // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
-                    //       wait for an encoder model that requires this pooling type in order to test it
-                    //       https://github.com/ggerganov/llama.cpp/pull/9510
-                    GGML_ABORT("RANK pooling not implemented yet");
-                }
+                    // extract the rerank score - a single float per sequence
+                    auto & embd_seq_out = embd_seq;
+
+                    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
+                        embd_seq_out[seq_id].resize(1);
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                    }
+                } break;
             case LLAMA_POOLING_TYPE_UNSPECIFIED:
                 {
                     GGML_ABORT("unknown pooling type");
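The new RANK branch stores one float per sequence in embd_seq. On the caller side those scores come back through llama_get_embeddings_seq(); a hedged sketch, assuming the batch was already run through llama_encode() on a reranker with pooling_type = LLAMA_POOLING_TYPE_RANK:

    // sketch: each sequence's rerank score is a single float, fetched by
    // sequence id after encoding
    #include "llama.h"
    #include <cstdio>

    void print_rerank_scores(llama_context * ctx, int32_t n_seqs) {
        for (llama_seq_id s = 0; s < n_seqs; ++s) {
            const float * score = llama_get_embeddings_seq(ctx, s); // nullptr if seq s has no output
            if (score != nullptr) {
                std::printf("seq %d: score %f\n", s, score[0]);
            }
        }
    }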
@@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) {
 }
 
 int llama_context::decode(llama_batch & inp_batch) {
+    if (!memory) {
+        LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
+        return encode(inp_batch);
+    }
+
     if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
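With the fallback above, calling llama_decode() on a cache-less context no longer fails hard: it logs the warning and routes the batch through encode(). A sketch of what a caller sees; llama_batch_get_one is the stock helper for a single contiguous token run:

    // sketch: on a cache-less (encoder-only) context, llama_decode() now
    // warns and behaves like llama_encode() instead of erroring out
    #include "llama.h"

    int run_batch(llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
        llama_batch batch = llama_batch_get_one(tokens, n_tokens);
        return llama_decode(ctx, batch); // forwarded to encode() when there is no KV cache
    }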
@@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-    } else if (logits_all || embd_pooled) {
+    } else if (embd_pooled) {
         n_outputs_all = n_tokens_all;
     } else {
         // keep last output only
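With logits_all gone, the output count is driven entirely by the batch's per-token logits flags (or by pooled-embedding mode). A hedged sketch of requesting an output for only the final token; it assumes a batch allocated with llama_batch_init, whose logits array is non-null:

    // sketch: per-token output selection via llama_batch::logits; only the
    // last token requests an output, so n_outputs_all ends up as 1
    #include "llama.h"

    void mark_last_token_output(llama_batch & batch) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            batch.logits[i] = 0;
        }
        batch.logits[batch.n_tokens - 1] = 1;
    }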
@@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
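Because this hunk reorders llama_context_params (the abort_callback pair moves ahead of the bool block and logits_all disappears), code that filled the struct positionally would now assign values to the wrong fields. Assigning by member name stays safe across the reorder; a hedged sketch with a placeholder callback:

    // sketch: override defaults by name so the field reorder above (and the
    // removal of logits_all) cannot silently shift values
    #include "llama.h"

    static bool my_abort_cb(void * data) { (void) data; return false; } // placeholder

    llama_context_params make_params() {
        llama_context_params p = llama_context_default_params();
        p.abort_callback      = my_abort_cb; // these two now sit before the bool block
        p.abort_callback_data = nullptr;
        p.embeddings          = false;       // logits_all has no replacement field
        return p;
    }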