graph : support cacheless embeddings with FA and iSWA

2026-05-13 07:09:03 +00:00 · 2025-10-12 10:34:44 +03:00 · 2025-10-12 10:34:44 +03:00 · d4d465bce4
commit d4d465bce4
parent 41aac5c69b
4 changed files with 108 additions and 51 deletions
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
-        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
+        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE: