Merge branch 'upstream' into concedo_experimental

# Conflicts: # .devops/full-rocm.Dockerfile # .devops/llama-cli-rocm.Dockerfile # .devops/llama-server-rocm.Dockerfile # .github/workflows/build.yml # .github/workflows/python-type-check.yml # CMakeLists.txt # CONTRIBUTING.md # README.md # ci/run.sh # examples/embedding/embedding.cpp # examples/server/README.md # flake.lock # ggml/include/ggml.h # ggml/src/ggml.c # requirements/requirements-convert_legacy_llama.txt # scripts/sync-ggml.last # src/llama-vocab.cpp # src/llama.cpp # tests/test-backend-ops.cpp # tests/test-grad0.cpp # tests/test-tokenizer-0.cpp
2025-09-13 02:19:41 +00:00 · 2024-10-02 01:00:57 +08:00 · 2024-10-02 01:00:57 +08:00 · ce7f9c9a2c
commit ce7f9c9a2c
parent a785a91e56 f1b8c42711
39 changed files with 103400 additions and 102738 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -102,6 +102,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
    };

    enum llama_rope_type {
@ -192,6 +193,7 @@ extern "C" {
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
        LLAMA_POOLING_TYPE_LAST = 3,
+        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
    };

    enum llama_attention_type {
@ -201,9 +203,9 @@ extern "C" {
    };

    enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
    };

    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@ -873,7 +875,8 @@ extern "C" {

    // Get the embeddings for a sequence id
    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // shape: [n_embd] (1-dimensional)
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

    //
@ -912,6 +915,8 @@ extern "C" {
    //
    // Tokenization
    //
+    // The API is thread-safe.
+    //

    /// @details Convert the provided text into tokens.
    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.