Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	docs/backend/CANN.md
#	examples/model-conversion/Makefile
#	examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
#	examples/model-conversion/scripts/causal/convert-model.sh
#	examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
#	examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
#	examples/model-conversion/scripts/causal/run-converted-model.sh
#	examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
#	examples/model-conversion/scripts/embedding/convert-model.sh
#	examples/model-conversion/scripts/embedding/modelcard.template
#	examples/model-conversion/scripts/embedding/run-converted-model.sh
#	examples/model-conversion/scripts/utils/create-collection-add-model.sh
#	examples/model-conversion/scripts/utils/inspect-converted-model.sh
#	examples/model-conversion/scripts/utils/inspect-org-model.py
#	examples/model-conversion/scripts/utils/perplexity-gen.sh
#	examples/model-conversion/scripts/utils/perplexity-run-simple.sh
#	examples/model-conversion/scripts/utils/perplexity-run.sh
#	examples/model-conversion/scripts/utils/quantize.sh
#	examples/model-conversion/scripts/utils/run-embedding-server.sh
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	src/llama-context.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
Author: Concedo
Date:   2025-09-05 13:25:34 +08:00
Commit: f0d4128e9f

43 changed files with 1408 additions and 256 deletions

src/llama-context.cpp

@@ -285,6 +285,9 @@ llama_context::llama_context(
         const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
+        // avoid reserving graphs with zero outputs
+        n_outputs = 1;
+
         LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
 
         // resolve automatic Flash Attention use
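
The first hunk pins n_outputs to 1 before the worst-case graph reservation so the reserve pass never sizes its output buffers for zero rows. A minimal sketch of that guard, under that assumption (clamp_reserve_outputs is an illustrative name, not a llama.cpp symbol):

#include <algorithm>
#include <cstdint>

// Hypothetical illustration of the guard above, not llama.cpp's actual API:
// a reservation sized for zero outputs would plan an unusable graph, so the
// worst-case pass always reserves for at least one output row.
static uint32_t clamp_reserve_outputs(uint32_t n_outputs) {
    return std::max<uint32_t>(n_outputs, 1u);
}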
@@ -1367,7 +1370,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
 }
 
 ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
-    // LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+    //LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+    GGML_ASSERT_CONTINUE(n_outputs >= 1);
 
     if (n_tokens % n_seqs != 0) {
         n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
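
Two details in the second hunk deserve a note. GGML_ASSERT_CONTINUE is a fork-side macro; going by its name and usage it reports a failed condition and keeps running rather than aborting like GGML_ASSERT. The rounding line is the usual round-up-to-multiple idiom: adding n_seqs - 1 before the integer division turns truncation into a ceiling. A small self-contained sketch of that arithmetic (round_up_to_multiple is a name introduced here for illustration):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Round n up to the next multiple of m (m > 0): adding m - 1 before the
// integer division makes truncation act as a ceiling, i.e. ceil(n/m) * m.
static uint32_t round_up_to_multiple(uint32_t n, uint32_t m) {
    assert(m > 0);
    return ((n + (m - 1)) / m) * m;
}

int main() {
    // mirrors the diff: 10 tokens split across 4 sequences must grow to 12
    std::printf("%u\n", round_up_to_multiple(10, 4)); // 12
    std::printf("%u\n", round_up_to_multiple(12, 4)); // already aligned: 12
    return 0;
}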