Merge branch 'upstream' into concedo_experimental

# Conflicts:
# .github/workflows/build.yml
# CMakeLists.txt
# cmake/common.cmake
# docs/backend/SYCL.md
# examples/main/README.md
# examples/speculative/speculative.cpp
# ggml/CMakeLists.txt
# ggml/src/CMakeLists.txt
# ggml/src/ggml-cpu/CMakeLists.txt
# ggml/src/ggml-musa/CMakeLists.txt
# ggml/src/ggml-sycl/CMakeLists.txt
# ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
# tests/test-backend-ops.cpp
2025-09-10 17:14:36 +00:00 · 2025-03-19 19:27:11 +08:00 · 2025-03-19 19:27:11 +08:00 · 0c90d2ebcf
commit 0c90d2ebcf
parent ddaa8d5a38 0fd8487b14
58 changed files with 4222 additions and 1537 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -285,11 +285,15 @@ llama_context::llama_context(

    // reserve worst-case graph
    if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph

+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
        // max number of outputs
        n_outputs = n_tokens;

@@ -341,6 +345,8 @@ llama_context::llama_context(
            }
        }

+        n_outputs = n_outputs_save;
+
        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
            ggml_backend_t             backend = backend_ptrs[i];
            ggml_backend_buffer_type_t buft    = backend_buft[i];
@@ -1052,6 +1058,13 @@ int llama_context::encode(llama_batch & inp_batch) {
    ggml_backend_sched_reset(sched.get());
    ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

+    const auto causal_attn_org = cparams.causal_attn;
+
+    // always use non-causal attention for encoder graphs
+    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+    cparams.causal_attn = false;
+
    auto * gf = graph_init();
    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);

@@ -1059,6 +1072,8 @@ int llama_context::encode(llama_batch & inp_batch) {

    res->set_inputs(&ubatch);

+    cparams.causal_attn = causal_attn_org;
+
    const auto compute_status = graph_compute(gf, n_tokens > 1);
    switch (compute_status) {
        case GGML_STATUS_SUCCESS:
@@ -1129,6 +1144,8 @@ int llama_context::encode(llama_batch & inp_batch) {
    if (model.arch == LLM_ARCH_T5 && t_embd) {
        //cross.t_embd = t_embd;

+        synchronize();
+
        cross.n_embd = t_embd->ne[0];
        cross.n_enc  = t_embd->ne[1];
        cross.v_embd.resize(cross.n_embd*cross.n_enc);