Merge branch 'upstream' into concedo_experimental

# Conflicts: # .devops/nix/package.nix # CMakeLists.txt # Makefile
2025-09-11 09:34:37 +00:00 · 2024-08-01 10:54:28 +08:00 · 2024-08-01 10:54:28 +08:00 · 101efb66af
commit 101efb66af
parent 9a04060aaa afbbcf3c04
4 changed files with 14 additions and 3 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1635,7 +1635,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "server",      "       --host HOST",            "ip address to listen (default: %s)", params.hostname.c_str() });
    options.push_back({ "server",      "       --port PORT",            "port to listen (default: %d)", params.port });
    options.push_back({ "server",      "       --path PATH",            "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server",      "       --embedding(s)",         "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server",      "       --embedding(s)",         "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
    options.push_back({ "server",      "       --api-key KEY",          "API key to use for authentication (default: none)" });
    options.push_back({ "server",      "       --api-key-file FNAME",   "path to file containing API keys (default: none)" });
    options.push_back({ "server",      "       --ssl-key-file FNAME",   "path to file a PEM-encoded SSL private key" });
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -247,7 +247,7 @@ server:
         --host HOST              ip address to listen (default: 127.0.0.1)
         --port PORT              port to listen (default: 8080)
         --path PATH              path to serve static files from (default: )
-         --embedding(s)           enable embedding endpoint (default: disabled)
+         --embedding(s)           restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
         --api-key KEY            API key to use for authentication (default: none)
         --api-key-file FNAME     path to file containing API keys (default: none)
         --ssl-key-file FNAME     path to file a PEM-encoded SSL private key
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -312,6 +312,8 @@ class GGUFWriter:
        self.add_key_value(key, val, GGUFValueType.STRING)
    def add_array(self, key: str, val: Sequence[Any]) -> None:
        """Forward ``val`` to ``add_key_value`` tagged as ``GGUFValueType.ARRAY``.

        A zero-length ``val`` is skipped: ``add_key_value`` is not called and
        the method simply returns.

        Args:
            key: Key passed through unchanged to ``add_key_value``.
            val: Sequence passed through unchanged to ``add_key_value``.
        """
        has_items = len(val) > 0
        if has_items:
            self.add_key_value(key, val, GGUFValueType.ARRAY)
    @staticmethod
@ -845,7 +847,14 @@ class GGUFWriter:
            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+        elif vtype == GGUFValueType.ARRAY:
            if not isinstance(val, Sequence):
                raise ValueError("Invalid GGUF metadata array, expecting sequence")
            if len(val) == 0:
                raise ValueError("Invalid GGUF metadata array. Empty array")
            if isinstance(val, bytes):
                ltype = GGUFValueType.UINT8
            else:
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -4988,6 +4988,7 @@ static void llm_load_hparams(
                hparams.attn_soft_cap = true;
                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_2B; break;
                    case 42: model.type = e_model::MODEL_9B; break;
                    case 46: model.type = e_model::MODEL_27B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
@ -11800,6 +11801,7 @@ struct llm_build_context {
                // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                switch (model.type) {
                    case e_model::MODEL_2B:
                    case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
                    case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                    default: GGML_ABORT("fatal error");