diff --git a/expose.h b/expose.h
index 4189e5142..4b7bfdf42 100644
--- a/expose.h
+++ b/expose.h
@@ -255,6 +255,7 @@ struct embeddings_generation_inputs
struct embeddings_generation_outputs
{
int status = -1;
+ int count = 0; //number of tokens in the embedded input
const char * data = "";
};
diff --git a/kcpp_docs.embd b/kcpp_docs.embd
index e32741f40..9857fad83 100644
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@@ -1492,6 +1492,28 @@
]
}
},
+ "/api/extra/embeddings": {
+ "post": {
+ "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+ "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "example": {},
+ "schema": {
+ "properties": {},
+ "type": "object"
+ }
+ }
+ },
+ "required": true
+ },
+ "tags": [
+ "api/extra"
+ ],
+ "responses": {"default": {"description": ""}}
+ }
+ },
"/api/extra/data/save": {
"post": {
"description": "Saves data to a slot in a database file in the KoboldCpp server.",
@@ -2152,6 +2174,28 @@
"responses": {"default": {"description": ""}}
}
},
+ "/v1/embeddings": {
+ "post": {
+ "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+ "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "example": {},
+ "schema": {
+ "properties": {},
+ "type": "object"
+ }
+ }
+ },
+ "required": true
+ },
+ "tags": [
+ "v1"
+ ],
+ "responses": {"default": {"description": ""}}
+ }
+ },
},
"servers": [
{
diff --git a/koboldcpp.py b/koboldcpp.py
index 484346eb6..40c21c067 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -336,6 +336,7 @@ class embeddings_generation_inputs(ctypes.Structure):
class embeddings_generation_outputs(ctypes.Structure):
_fields_ = [("status", ctypes.c_int),
+ ("count", ctypes.c_int),
("data", ctypes.c_char_p)]
def getdirpath():
@@ -1602,14 +1603,34 @@ def embeddings_load_model(model_filename):
def embeddings_generate(genparams):
global args
- prompt = genparams.get("input", "")
- inputs = embeddings_generation_inputs()
- inputs.prompt = prompt.encode("UTF-8")
- ret = handle.embeddings_generate(inputs)
- outstr = ""
- if ret.status==1:
- outstr = ret.data.decode("UTF-8","ignore")
- return outstr
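+ # the OpenAI-style "input" field may be a single string or a list of strings; normalize it to a list of prompts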
+ prompts = []
+ if isinstance(genparams.get('input',[]), list):
+ prompts = genparams.get('input',[])
+ else:
+ prompt = genparams.get("input", "")
+ if prompt:
+ prompts.append(prompt)
+
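+ # embed each prompt separately: the backend returns the vector as a JSON-encoded float array plus a token count
+ # the per-prompt vectors are collected and the token counts are summed for usage reporting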
+ tokarrs = []
+ tokcnt = 0
+ for prompt in prompts:
+ tokarr = []
+ tmpcnt = 0
+ try:
+ inputs = embeddings_generation_inputs()
+ inputs.prompt = prompt.encode("UTF-8")
+ ret = handle.embeddings_generate(inputs)
+ if ret.status==1:
+ outstr = ret.data.decode("UTF-8","ignore")
+ tokarr = json.loads(outstr) if outstr else []
+ tmpcnt = ret.count
+ except Exception as e:
+ tokarr = []
+ tmpcnt = 0
+ print(f"Error: {e}")
+ tokarrs.append(tokarr)
+ tokcnt += tmpcnt
+ return {"count":tokcnt, "data":tokarrs}
def tokenize_ids(countprompt,tcaddspecial):
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
@@ -3227,7 +3248,12 @@ Enter Prompt:
elif is_embeddings:
try:
gen = embeddings_generate(genparams)
- genresp = (json.dumps({"object":"list","data":[{"object":"embedding","index":0,"embedding":[-0.003880035,-0.05006583]}],"model":"text-embedding-3-small","usage":{"prompt_tokens":2,"total_tokens":2}}).encode())
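+ # build an OpenAI-compatible response: one embedding object per input prompt, with the summed token count reported as usage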
+ outdatas = []
+ odidx = 0
+ for od in gen["data"]:
+ outdatas.append({"object":"embedding","index":odidx,"embedding":od})
+ odidx += 1
+ genresp = (json.dumps({"object":"list","data":outdatas,"model":"koboldcpp-embeddings","usage":{"prompt_tokens":gen["count"],"total_tokens":gen["count"]}}).encode())
self.send_response(200)
self.send_header('content-length', str(len(genresp)))
self.end_headers(content_type='application/json')
diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp
index 94344ae5f..982ecea1f 100644
--- a/otherarch/embeddings_adapter.cpp
+++ b/otherarch/embeddings_adapter.cpp
@@ -24,7 +24,7 @@
static llama_context * embeddings_ctx = nullptr; //text to codes ctx
static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
bool embeddings_debug = false;
-const int max_batchsize = 2048;
+static int max_batchsize = 512;
static std::string last_output = "";
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
@@ -128,16 +128,20 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
model_params.use_mlock = false;
model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+
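+ //load the model first so the maximum batch size can be capped at its trained context length,
+ //since the entire input must fit into a single ubatch when generating embeddings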
+ llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
+ const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
+
+ max_batchsize = n_ctx_train;
ctx_params.embeddings = true;
ctx_params.n_ubatch = ctx_params.n_batch = max_batchsize; //max size, must fit
- ctx_params.n_ctx = max_batchsize + 512;
+ ctx_params.n_ctx = max_batchsize;
ctx_params.logits_all = false;
ctx_params.offload_kqv = true;
ctx_params.n_threads = nthreads;
ctx_params.n_threads_batch = nthreads;
ctx_params.flash_attn = inputs.flash_attention;
- llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
embeddings_ctx = llama_new_context_with_model(embeddingsmodel, ctx_params);
if (embeddings_ctx == nullptr) {
@@ -156,7 +160,6 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
const llama_vocab * vocab = llama_model_get_vocab(embeddingsmodel);
- const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
const int n_ctx = llama_n_ctx(embeddings_ctx);
if (llama_model_has_encoder(embeddingsmodel) && llama_model_has_decoder(embeddingsmodel)) {
@@ -181,6 +184,7 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
printf("\nWarning: KCPP Embeddings Model not initialized!\n");
output.data = "";
output.status = 0;
+ output.count = 0;
return output;
}
@@ -197,14 +201,19 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
std::vector<std::vector<int32_t>> prompt_inputs;
auto inp = common_tokenize(embeddings_ctx, prompt, true, true);
if (inp.size() > n_batch) {
- printf("\n%s: number of tokens in input line (%lld) exceeds batch size (%lld), lower token amount!\n",
+ printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
output.data = "";
output.status = 0;
+ output.count = 0;
return output;
}
prompt_inputs.push_back(inp);
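+ //dump the tokenized prompt when embeddings debug mode is enabled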
+ if(embeddings_debug)
+ {
+ print_tok_vec(inp);
+ }
printf("\nGenerating Embeddings for %d tokens...",inp.size());
// initialize batch
@@ -272,5 +281,6 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generatio
output.data = last_output.c_str();
output.status = 1;
+ output.count = (int)inp.size(); //number of tokens embedded, reported for usage stats
return output;
}
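
A minimal usage sketch for the new OpenAI-compatible endpoint (illustrative only, not part of the patch). It assumes a local KoboldCpp server on the default port 5001 and the requests library; the model name is omitted since the server reports "koboldcpp-embeddings" regardless.

import requests

resp = requests.post("http://localhost:5001/v1/embeddings",
                     json={"input": ["first text", "second text"]})  # "input" may also be a single string
body = resp.json()
for item in body["data"]:
    print(item["index"], len(item["embedding"]))  # index and vector length for each input
print(body["usage"]["total_tokens"])              # token count summed across all inputs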