Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	examples/embedding/embedding.cpp
#	tools/imatrix/imatrix.cpp
#	tools/perplexity/perplexity.cpp
Concedo committed 2025-05-08 23:41:02 +08:00
commit 2439014a03
40 changed files with 2058 additions and 429 deletions

src/llama-context.cpp

@@ -116,8 +116,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    logits_all = params.logits_all;
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
@@ -253,7 +251,7 @@ llama_context::llama_context(
     }
 
     // reserve worst-case graph
-    if (!hparams.vocab_only) {
+    if (!hparams.vocab_only && memory) {
         const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
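Context for the new `memory` guard: a context can now be created without a KV cache at all (e.g. for encoder-only embedding models), so the worst-case graph reservation must be skipped when `memory` is null. A minimal sketch of setting up such a cache-less context; the helper name and the mean-pooling choice are illustrative, not part of this commit:

    #include "llama.h"

    // sketch: embeddings-only context; for encoder-only models no KV cache
    // ("memory") is allocated, so the reservation above is skipped
    llama_context * make_embedding_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                    // extract embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // illustrative choice
        return llama_init_from_model(model, cparams);
    }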
@@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) {
         t_compute_start_us = ggml_time_us();
     }
 
+    embd_seq.clear();
+
     n_queued_tokens += n_tokens;
 
     const int64_t n_embd = hparams.n_embd;
@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
GGML_ASSERT(embd != nullptr);
switch (cparams.pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
} break;
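For reference, roughly how a caller consumes the per-token embeddings that the NONE branch copies out; a sketch assuming `ctx` was created with `cparams.embeddings = true`, `model` is the loaded model, and `batch` has already gone through llama_encode():

    const int     n_embd = llama_model_n_embd(model);
    const float * embd   = llama_get_embeddings(ctx);   // n_tokens * n_embd floats
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        const float * row = embd + (size_t) i * n_embd; // embedding of token i
        // ... consume row[0 .. n_embd - 1]
    }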
@@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) {
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
                 {
-                    // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
-                    //       wait for an encoder model that requires this pooling type in order to test it
-                    //       https://github.com/ggerganov/llama.cpp/pull/9510
-                    GGML_ABORT("RANK pooling not implemented yet");
-                }
+                    // extract the rerank score - a single float per sequence
+                    auto & embd_seq_out = embd_seq;
+
+                    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
+                        embd_seq_out[seq_id].resize(1);
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                    }
+                } break;
             case LLAMA_POOLING_TYPE_UNSPECIFIED:
                 {
                     GGML_ABORT("unknown pooling type");
@@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) {
 }
 
 int llama_context::decode(llama_batch & inp_batch) {
+    if (!memory) {
+        LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
+        return encode(inp_batch);
+    }
+
     if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
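With this guard, llama_decode() on a cache-less context no longer aborts; it logs the warning and takes the encode path, so older call sites keep working. A sketch, assuming `ctx` is a cache-less embeddings context:

    // assume `tokens` holds a tokenized prompt (std::vector<llama_token>)
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    llama_encode(ctx, batch); // preferred for a context without a KV cache
    llama_decode(ctx, batch); // now warns, then behaves like llama_encode()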
@@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-    } else if (logits_all || embd_pooled) {
+    } else if (embd_pooled) {
         n_outputs_all = n_tokens_all;
     } else {
         // keep last output only
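With `logits_all` removed, callers request outputs per token through `llama_batch::logits`; leaving the array unset keeps the default of last-token-only. A sketch, assuming `prompt_tokens` holds `n_prompt` tokens on sequence 0:

    llama_batch batch = llama_batch_init(n_prompt, /*embd=*/ 0, /*n_seq_max=*/ 1);
    for (int32_t i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = prompt_tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = (i == n_prompt - 1); // output for the last token only
    }
    batch.n_tokens = n_prompt;

    if (llama_decode(ctx, batch) == 0) {
        const float * logits = llama_get_logits_ith(ctx, n_prompt - 1);
        // ... sample from logits
    }
    llama_batch_free(batch);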
@@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;