model : gemma3n text-only (#14400)

* gemma3n * add llm_graph_input_one
2025-09-14 02:49:41 +00:00 · 2025-06-26 19:34:02 +02:00 · 2025-06-26 19:34:02 +02:00 · 8846aace49
commit 8846aace49
parent a01047b041
13 changed files with 960 additions and 15 deletions
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@ -223,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                new_type = GGML_TYPE_Q6_K;
            }
        }
-    } else if (name == "token_embd.weight") {
+    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
@ -830,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

+        // these are very small (e.g. 4x4)
+        quantize &= name.find("altup")  == std::string::npos;
+        quantize &= name.find("laurel") == std::string::npos;
+
+        // these are not too big, so keep them as they are (unquantized)
+        quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");