diff --git a/expose.h b/expose.h
index 4189e5142..4b7bfdf42 100644
--- a/expose.h
+++ b/expose.h
@@ -255,6 +255,7 @@ struct embeddings_generation_inputs
 struct embeddings_generation_outputs
 {
     int status = -1;
+    int count = 0;
     const char * data = "";
 };

diff --git a/kcpp_docs.embd b/kcpp_docs.embd
index e32741f40..9857fad83 100644
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@@ -1492,6 +1492,28 @@
         ]
       }
     },
+    "/api/extra/embeddings": {
+      "post": {
+        "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+        "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\nPlease refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "example": {},
+              "schema": {
+                "properties": {},
+                "type": "object"
+              }
+            }
+          },
+          "required": true
+        },
+        "tags": [
+          "api/extra"
+        ],
+        "responses": {"default": {"description": ""}}
+      }
+    },
     "/api/extra/data/save": {
       "post": {
         "description": "Saves data to a slot in a database file in the KoboldCpp server.",
@@ -2152,6 +2174,28 @@
         "responses": {"default": {"description": ""}}
       }
     },
+    "/v1/embeddings": {
+      "post": {
+        "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+        "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\nPlease refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "example": {},
+              "schema": {
+                "properties": {},
+                "type": "object"
+              }
+            }
+          },
+          "required": true
+        },
+        "tags": [
+          "v1"
+        ],
+        "responses": {"default": {"description": ""}}
+      }
+    },
   },
   "servers": [
     {
diff --git a/koboldcpp.py b/koboldcpp.py
index 484346eb6..40c21c067 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -336,6 +336,7 @@ class embeddings_generation_inputs(ctypes.Structure):

 class embeddings_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
+                ("count", ctypes.c_int),
                 ("data", ctypes.c_char_p)]

 def getdirpath():
@@ -1602,14 +1603,34 @@ def embeddings_load_model(model_filename):

 def embeddings_generate(genparams):
     global args
-    prompt = genparams.get("input", "")
-    inputs = embeddings_generation_inputs()
-    inputs.prompt = prompt.encode("UTF-8")
-    ret = handle.embeddings_generate(inputs)
-    outstr = ""
-    if ret.status==1:
-        outstr = ret.data.decode("UTF-8","ignore")
-    return outstr
+    prompts = []
+    if isinstance(genparams.get('input',[]), list):
+        prompts = genparams.get('input',[])
+    else:
+        prompt = genparams.get("input", "")
+        if prompt:
+            prompts.append(prompt)
+
+    tokarrs = []
+    tokcnt = 0
+    for prompt in prompts:
+        tokarr = []
+        tmpcnt = 0
+        try:
+            inputs = embeddings_generation_inputs()
+            inputs.prompt = prompt.encode("UTF-8")
+            ret = handle.embeddings_generate(inputs)
+            if ret.status==1:
+                outstr = ret.data.decode("UTF-8","ignore")
+                tokarr = json.loads(outstr) if outstr else []
+                tmpcnt = ret.count
+        except Exception as e:
+            tokarr = []
+            tmpcnt = 0
+            print(f"Error: {e}")
+        tokarrs.append(tokarr)
+        tokcnt += tmpcnt
+    return {"count":tokcnt, "data":tokarrs}

 def tokenize_ids(countprompt,tcaddspecial):
     rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
@@ -3227,7 +3248,12 @@ Enter Prompt:
         elif is_embeddings:
             try:
                 gen = embeddings_generate(genparams)
-                genresp = (json.dumps({"object":"list","data":[{"object":"embedding","index":0,"embedding":[-0.003880035,-0.05006583]}],"model":"text-embedding-3-small","usage":{"prompt_tokens":2,"total_tokens":2}}).encode())
+                outdatas = []
+                odidx = 0
+                for od in gen["data"]:
+                    outdatas.append({"object":"embedding","index":odidx,"embedding":od})
+                    odidx += 1
+                genresp = (json.dumps({"object":"list","data":outdatas,"model":"koboldcpp-embeddings","usage":{"prompt_tokens":gen["count"],"total_tokens":gen["count"]}}).encode())
                 self.send_response(200)
                 self.send_header('content-length', str(len(genresp)))
                 self.end_headers(content_type='application/json')
diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp
index 94344ae5f..982ecea1f 100644
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@@ -24,7 +24,7 @@ static llama_context * embeddings_ctx = nullptr; //text to codes ctx
 static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
 bool embeddings_debug = false;

-const int max_batchsize = 2048;
+static int max_batchsize = 512;
 static std::string last_output = "";

 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
@@ -128,16 +128,20 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     model_params.use_mlock = false;
     model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+
+    llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
+    const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
+
+    max_batchsize = n_ctx_train;
     ctx_params.embeddings = true;
     ctx_params.n_ubatch = ctx_params.n_ubatch = max_batchsize; //max size, must fit
-    ctx_params.n_ctx = max_batchsize + 512;
+    ctx_params.n_ctx = max_batchsize;
     ctx_params.logits_all = false;
     ctx_params.offload_kqv = true;
     ctx_params.n_threads = nthreads;
     ctx_params.n_threads_batch = nthreads;
     ctx_params.flash_attn = inputs.flash_attention;

-    llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
     embeddings_ctx = llama_new_context_with_model(embeddingsmodel, ctx_params);
     if (embeddings_ctx == nullptr)
     {
@@ -156,7 +160,6 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)

     const llama_vocab * vocab = llama_model_get_vocab(embeddingsmodel);

-    const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
     const int n_ctx = llama_n_ctx(embeddings_ctx);

     if (llama_model_has_encoder(embeddingsmodel) && llama_model_has_decoder(embeddingsmodel)) {
@@ -181,6 +184,7 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
         printf("\nWarning: KCPP Embeddings Model not initialized!\n");
         output.data = "";
         output.status = 0;
+        output.count = 0;
         return output;
     }

@@ -197,14 +201,19 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
     std::vector<std::vector<int32_t>> prompt_inputs;
     auto inp = common_tokenize(embeddings_ctx, prompt, true, true);
     if (inp.size() > n_batch) {
-        printf("\n%s: number of tokens in input line (%lld) exceeds batch size (%lld), lower token amount!\n",
+        printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
             __func__, (long long int) inp.size(), (long long int) n_batch);
         output.data = "";
         output.status = 0;
+        output.count = 0;
         return output;
     }

     prompt_inputs.push_back(inp);
+    if(embeddings_debug)
+    {
+        print_tok_vec(inp);
+    }
     printf("\nGenerating Embeddings for %d tokens...",inp.size());

     // initialize batch
@@ -272,5 +281,6 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio

     output.data = last_output.c_str();
     output.status = 1;
+    output.count = inp.size();
     return output;
 }
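For reference, a minimal client-side sketch of how the new endpoint could be exercised once this patch is applied. The host/port, the example input strings, and the `model` field in the payload are assumptions for illustration (5001 is KoboldCpp's usual default port, and `model` is not read by the handler in this diff); the response shape mirrors the handler code above.

```python
# Usage sketch (not part of the patch): query the new OpenAI-compatible embeddings endpoint.
# Assumes a KoboldCpp server running locally on port 5001 with an embeddings model loaded.
import json
import urllib.request

payload = {
    "input": ["first sentence", "second sentence"],  # a single string also works per embeddings_generate()
    "model": "koboldcpp-embeddings",                  # not used by this patch; kept for OpenAI clients
}
req = urllib.request.Request(
    "http://localhost:5001/api/extra/embeddings",     # the patch also documents /v1/embeddings
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    out = json.load(resp)

# Per the handler above, the response looks like:
# {"object": "list",
#  "data": [{"object": "embedding", "index": 0, "embedding": [...]},
#           {"object": "embedding", "index": 1, "embedding": [...]}],
#  "model": "koboldcpp-embeddings",
#  "usage": {"prompt_tokens": N, "total_tokens": N}}
for item in out["data"]:
    print(item["index"], len(item["embedding"]))
```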