From 41e78c567e9a8c652e405f4f909deb598deecd31 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 30 Jul 2025 18:07:11 +0200
Subject: [PATCH] server : add support for `embd_normalize` parameter (#14964)

This commit adds support for the `embd_normalize` parameter in the
server code.

The motivation for this is that currently, if the server is started with
a pooling type other than `none`, Euclidean/L2 normalization is always
applied to the embeddings. However, this is not always the desired
behavior: users may want a different normalization method (or none at
all), and this commit allows that.

Example usage:
```console
curl --request POST \
    --url http://localhost:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello world today", "embd_normalize": -1}'
```
---
 tools/server/README.md  |  9 +++++++++
 tools/server/server.cpp | 14 +++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index aa07f1ef5..f3f4caed8 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -644,6 +644,15 @@ The same as [the embedding example](../embedding) does.
 
 `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
+`embd_normalize`: Normalization for pooled embeddings. Can be one of the following values:
+```
+  -1: No normalization
+   0: Max absolute
+   1: Taxicab
+   2: Euclidean/L2
+  >2: P-Norm
+```
+
 ### POST `/reranking`: Rerank documents according to a given query
 
 Similar to https://jina.ai/reranker/ but might change in the future.
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 022b5d0b3..2e4c40af7 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -138,6 +138,9 @@ struct slot_params {
     std::string                  oaicompat_cmpl_id;
     common_chat_syntax           oaicompat_chat_syntax;
 
+    // Embeddings
+    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
+
     json to_json() const {
         std::vector<std::string> samplers;
         samplers.reserve(sampling.samplers.size());
@@ -2601,7 +2604,7 @@ struct server_context {
 
             // normalize only when there is pooling
             if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
-                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                common_embd_normalize(embd, embd_res.data(), n_embd, slot.params.embd_normalize);
                 res->embedding.push_back(embd_res);
                 break;
             } else {
@@ -4614,6 +4617,14 @@ int main(int argc, char ** argv) {
             }
         }
 
+        int embd_normalize = 2; // default to Euclidean/L2 norm
+        if (body.count("embd_normalize") != 0) {
+            embd_normalize = body.at("embd_normalize");
+            if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+                SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx));
+            }
+        }
+
         // create and queue the task
         json responses = json::array();
         bool error = false;
@@ -4629,6 +4640,7 @@ int main(int argc, char ** argv) {
 
                 // OAI-compat
                 task.params.oaicompat = oaicompat;
+                task.params.embd_normalize = embd_normalize;
 
                 tasks.push_back(std::move(task));
             }