fixed embeddings, added new parameter to limit max embeddings context

2025-09-10 17:14:36 +00:00 · 2025-06-10 01:11:55 +08:00 · 2025-06-10 01:11:55 +08:00 · 7d8aa31f1f
commit 7d8aa31f1f
parent 8780b33c64
6 changed files with 360 additions and 9 deletions
--- a/2
+++ b/2
@ -699,6 +699,8 @@ mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp common/arg.cpp build-info.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ggml/src/ggml-vulkan-shaders.cpp:
 ifdef VULKAN_BUILD
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -0,0 +1,338 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <ctime>
+#include <algorithm>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
+    // Splits `s` on every occurrence of `separator`; always returns at least one element (the text after the last match).
+    std::vector<std::string> lines;
+    size_t start = 0;
+    // An empty separator matches at every offset without ever advancing `start`, which would loop forever:
+    // skip the scan in that case so the whole string comes back as a single element.
+    size_t end = separator.empty() ? std::string::npos : s.find(separator);
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
+    }
+    lines.push_back(s.substr(start)); // text after the last separator (possibly empty)
+    return lines;
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    llama_pos next_pos = 0; // each call lays its tokens out from position 0, whatever is already in the batch
+    for (const int32_t token : tokens) {
+        common_batch_add(batch, token, next_pos++, { seq_id }, true); // one entry per token, all tagged with seq_id
+    }
+}
+
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    // Decodes `batch` and stores one normalized row of n_embd floats in `output` for every batch entry whose
+    // batch.logits flag is set. n_seq is only reported in the log line below.
+    const bool per_token = llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_NONE;
+
+    // forget what the previous batch left in the context memory (irrelevant for embeddings)
+    llama_memory_clear(llama_get_memory(ctx));
+
+    // run model
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to process\n", __func__); // only logged: the rows are still fetched below
+    }
+
+    for (int idx = 0; idx < batch.n_tokens; idx++) {
+        if (!batch.logits[idx]) {
+            continue;
+        }
+
+        // destination row, relative to `output`:
+        //   no pooling -> index of the entry inside the batch (token embeddings)
+        //   pooling    -> first sequence id of the entry (sequence embeddings)
+        const int row = per_token ? idx : batch.seq_id[idx][0];
+
+        const float * embd = per_token ? llama_get_embeddings_ith(ctx, idx) : llama_get_embeddings_seq(ctx, row);
+        if (per_token) {
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
+
+        // with pooling, every flagged entry of a sequence lands on the same row, so it is written once per entry
+        float * dst = output + row * n_embd;
+        common_embd_normalize(embd, dst, n_embd, embd_norm);
+    }
+}
+
+int main(int argc, char ** argv) { // embedding example: embeds every embd_sep-delimited piece of the prompt and prints the vectors
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+        return 1;
+    }
+
+    common_init();
+
+    params.embedding = true;
+
+    // utilize the full context: raise the batch size so that an input as long as n_ctx fits in one batch
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
+    // For non-causal models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model; `model` and `ctx` below are non-owning pointers obtained from llama_init via .get()
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == NULL) { // NOTE(review): only `model` is checked; `ctx` is used below without a NULL check
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        return 1;
+    }
+
+    if (n_ctx > n_ctx_train) {
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+
+    // split the prompt into separate inputs at every occurrence of params.embd_sep
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
+
+    // max batch size in tokens; also the size passed to llama_batch_init below
+    const uint64_t n_batch = params.n_batch;
+
+    // tokenize the prompts; an input longer than n_batch tokens is a fatal error (nothing is truncated)
+    std::vector<std::vector<int32_t>> inputs;
+    for (const auto & prompt : prompts) {
+        auto inp = common_tokenize(ctx, prompt, true, true);
+        if (inp.size() > n_batch) {
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                    __func__, (long long int) inp.size(), (long long int) n_batch);
+            return 1;
+        }
+        inputs.push_back(inp);
+    }
+
+    // check if the last token is SEP (warning only, processing continues either way)
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+        }
+    }
+
+    // tokenization stats (only with params.verbose_prompt): every prompt followed by its token ids and pieces
+    if (params.verbose_prompt) {
+        for (int i = 0; i < (int) inputs.size(); i++) {
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            for (int j = 0; j < (int) inputs[i].size(); j++) {
+                LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
+            }
+            LOG("\n\n");
+        }
+    }
+
+    // initialize batch, sized by n_batch (remaining arguments: see llama_batch_init in llama.h)
+    const int n_prompts = prompts.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+    // count number of embeddings: one per token without pooling, otherwise one per prompt
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
+    // allocate output: n_embd_count rows of n_embd floats, zero-initialized
+    const int n_embd = llama_model_n_embd(model);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
+    float * emb = embeddings.data();
+
+    // break into batches: whole prompts are packed in order and the batch is decoded when the next one would not fit
+    int e = 0; // number of embeddings already stored
+    int s = 0; // number of prompts in current batch (also used as the sequence id of the next prompt)
+    for (int k = 0; k < n_prompts; k++) {
+        // no clamping happens here: inputs longer than n_batch tokens were already rejected above
+        auto & inp = inputs[k];
+
+        const uint64_t n_toks = inp.size();
+
+        // decode what has been collected so far if this prompt would overflow the batch
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + e * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; // rows consumed: tokens without pooling, sequences otherwise
+            s = 0;
+            common_batch_clear(batch);
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
+    }
+
+    // final batch: decode whatever is still pending after the loop
+    float * out = emb + e * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+
+    if (params.embd_out.empty()) { // embd_out not set: print a human-readable summary
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) { // one row per token: first 3 and last 3 values of each
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) { // NOTE(review): the start index is negative when n_embd < 3
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { // the first n_cls_out values of each row are printed as scores
+            const uint32_t n_cls_out = llama_model_n_cls_out(model);
+            std::vector<std::string> cls_out_labels;
+
+            for (uint32_t i = 0; i < n_cls_out; i++) { // label text, falling back to the index when the model has no label
+                const char * label = llama_model_cls_label(model, i);
+                const std::string label_i(label == nullptr ? "" : label);
+                cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
+            }
+
+            for (int j = 0; j < n_embd_count; j++) {
+                for (uint32_t i = 0; i < n_cls_out; i++) {
+                    // NOTE: if you change this log - update the tests in ci/run.sh
+                    if (n_cls_out == 1) {
+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                    } else {
+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                    }
+                }
+            }
+        } else {
+            // print the first part of the embeddings (up to 16 values) or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+
+            // print cosine similarity matrix (header row and row suffix are the truncated prompt texts)
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
+                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
+                }
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { // structured output
+        const bool notArray = params.embd_out != "array";
+
+        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt) - NOTE(review): row 0 is printed before any bound check, so n_embd_count >= 1 is assumed
+            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            LOG("[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) LOG(","); else break;
+            }
+            LOG(notArray ? "]\n    }" : "]");
+            j++;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
+        }
+        LOG(notArray ? "\n  ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            LOG(",\n  \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iterations - NOTE(review): the guard tests n_prompts > 1 but the bound is n_embd_count; they differ without pooling
+                LOG("    [");
+                for (int j = 0;;) { // same bound as the outer loop (n_embd_count), giving a square matrix
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    LOG("%6.2f", sim);
+                    j++;
+                    if (j < n_embd_count) LOG(", "); else break;
+                }
+                LOG(" ]");
+                i++;
+                if (i < n_embd_count) LOG(",\n"); else break;
+            }
+            LOG("\n  ]");
+        }
+
+        if (notArray) LOG("\n}\n");
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
+    // clean up (the early `return 1` paths above leave without reaching this point)
+    llama_batch_free(batch);
+    llama_backend_free();
+
+    return 0;
+}
--- a/expose.h
+++ b/expose.h
@ -257,6 +257,7 @@ struct embeddings_load_model_inputs
    const int gpulayers = 0;
    const bool flash_attention = false;
    const bool use_mmap = false;
+    const int embeddingsmaxctx = 0;
    const bool quiet = false;
    const int debugmode = 0;
 };
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -56,7 +56,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128

 # global vars
-KcppVersion = "1.93.1"
+KcppVersion = "1.93.2"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}
@ -350,6 +350,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                ("gpulayers", ctypes.c_int),
                ("flash_attention", ctypes.c_bool),
                ("use_mmap", ctypes.c_bool),
+                ("embeddingsmaxctx", ctypes.c_int),
                ("quiet", ctypes.c_bool),
                ("debugmode", ctypes.c_int)]

@ -1742,6 +1743,7 @@ def embeddings_load_model(model_filename):
    inputs.flash_attention = False
    inputs.threads = args.threads
    inputs.use_mmap = args.usemmap
+    inputs.embeddingsmaxctx = args.embeddingsmaxctx
    inputs = set_backend_props(inputs)
    ret = handle.embeddings_load_model(inputs)
    return ret
@ -4245,6 +4247,7 @@ def show_gui():
    ttsmaxlen_var = ctk.StringVar(value=str(default_ttsmaxlen))

    embeddings_model_var = ctk.StringVar()
+    embeddings_ctx_var = ctk.StringVar(value=str(""))

    admin_var = ctk.IntVar(value=0)
    admin_dir_var = ctk.StringVar()
@ -4852,7 +4855,8 @@ def show_gui():
    makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 13, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
    makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 13, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
    makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 13, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
-    makefileentry(model_tab, "Embeds Model:", "Select Embeddings Model File", embeddings_model_var, 15, width=280,singlerow=True, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select an embeddings GGUF model that can be used to generate embedding vectors.")
+    makefileentry(model_tab, "Embeds Model:", "Select Embeddings Model File", embeddings_model_var, 15, width=160,singlerow=True, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select an embeddings GGUF model that can be used to generate embedding vectors.")
+    makelabelentry(model_tab, "EmbdCtx: ", embeddings_ctx_var, 15, 50,padx=390,singleline=True,tooltip="If set above 0, limits max context for embedding model to save memory.", labelpadx=330)
    makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 17,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
    makefileentry(model_tab, "SaveData File:", "Select or Create New SaveData Database File", savedatafile_var, 19,width=280,filetypes=[("KoboldCpp SaveDB", "*.jsondb")],singlerow=True,dialog_type=1,tooltiptxt="Selecting a file will allow data to be loaded and saved persistently to this KoboldCpp server remotely. File is created if it does not exist.")
    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
@ -5207,6 +5211,9 @@ def show_gui():
        if embeddings_model_var.get() != "":
            args.embeddingsmodel = embeddings_model_var.get()

+        if embeddings_ctx_var.get() != "":
+            args.embeddingsmaxctx = (0 if embeddings_ctx_var.get()=="" else int(embeddings_ctx_var.get()))
+
        if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
            args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
            args.ttsmodel = tts_model_var.get()
@ -5401,6 +5408,7 @@ def show_gui():
        ttsmaxlen_var.set(str(dict["ttsmaxlen"]) if ("ttsmaxlen" in dict and dict["ttsmaxlen"]) else str(default_ttsmaxlen))

        embeddings_model_var.set(dict["embeddingsmodel"] if ("embeddingsmodel" in dict and dict["embeddingsmodel"]) else "")
+        embeddings_ctx_var.set(str(dict["embeddingsmaxctx"]) if ("embeddingsmaxctx" in dict and dict["embeddingsmaxctx"]) else "")

        admin_var.set(dict["admin"] if ("admin" in dict) else 0)
        admin_dir_var.set(dict["admindir"] if ("admindir" in dict and dict["admindir"]) else "")
@ -7139,6 +7147,7 @@ if __name__ == '__main__':

    embeddingsparsergroup = parser.add_argument_group('Embeddings Model Commands')
    embeddingsparsergroup.add_argument("--embeddingsmodel", metavar=('[filename]'), help="Specify an embeddings model to be loaded for generating embedding vectors.", default="")
+    embeddingsparsergroup.add_argument("--embeddingsmaxctx", metavar=('[amount]'), help="Overrides the default maximum supported context of an embeddings model (defaults to trained context).", type=int, default=0)

    admingroup = parser.add_argument_group('Administration Commands')
    admingroup.add_argument("--admin", help="Enables admin mode, allowing you to unload and reload different configurations or models.", action='store_true')
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@ -34,7 +34,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
    }
 }

-static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    const struct llama_model * model = llama_get_model(ctx);

@ -48,7 +48,7 @@ static void batch_encode(llama_context * ctx, llama_batch & batch, float * outpu
    }

    // run model
-    if (llama_encode(ctx, batch) < 0) {
+    if (llama_decode(ctx, batch) < 0) {
        printf("%s : failed to process\n", __func__);
    }

@ -125,7 +125,8 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
    llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
    const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);

-    max_batchsize = n_ctx_train;
+    max_batchsize = (inputs.embeddingsmaxctx>0?inputs.embeddingsmaxctx:n_ctx_train);
+    max_batchsize = (max_batchsize>n_ctx_train?n_ctx_train:max_batchsize);
    ctx_params.embeddings = true;
    ctx_params.n_ubatch = max_batchsize; //max size, must fit
    ctx_params.n_ctx = max_batchsize;
@ -144,7 +145,7 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)

    std::vector<int> tmp = {1, 2, 3, 4};
    llama_kv_self_clear(embeddings_ctx);
-    auto er = llama_encode(embeddings_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
+    auto er = llama_decode(embeddings_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
    if(er!=0)
    {
        printf("\nEmbeddings Model Eval returned nonzero: %d\n",er);
@ -259,7 +260,7 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + e * n_embd;
-            batch_encode(embeddings_ctx, batch, out, s, n_embd, embd_normalize);
+            batch_decode(embeddings_ctx, batch, out, s, n_embd, embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
            s = 0;
            common_batch_clear(batch);
@ -271,7 +272,7 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio

    // final batch
    float * out = emb + e * n_embd;
-    batch_encode(embeddings_ctx, batch, out, s, n_embd, embd_normalize);
+    batch_decode(embeddings_ctx, batch, out, s, n_embd, embd_normalize);

    std::string outputarray = "[";
    for (int i = 0; i < n_embd; i++) {
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -890,7 +890,7 @@ int llama_context::encode(llama_batch & inp_batch) {

 int llama_context::decode(llama_batch & inp_batch) {
    if (!memory) {
-        LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
+        //LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
        return encode(inp_batch);
    }