mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

embeddings done

This commit is contained in:
parent 82f2654049
commit 2bdf1dacff

4 changed files with 95 additions and 14 deletions

expose.h (1 change)
@@ -255,6 +255,7 @@ struct embeddings_generation_inputs
 struct embeddings_generation_outputs
 {
     int status = -1;
+    int count = 0;
     const char * data = "";
 };
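
The new count field carries the number of tokens that were embedded, alongside the JSON-encoded vector in data. A minimal sketch of how the Python side can read it (assuming the ctypes bindings added to koboldcpp.py further down; handle is the already-loaded shared library):

import json

# Illustration only: embeddings_generation_inputs/outputs and handle are
# defined inside koboldcpp.py and mirror the C structs above.
inputs = embeddings_generation_inputs()
inputs.prompt = "hello world".encode("UTF-8")
ret = handle.embeddings_generate(inputs)
if ret.status == 1:
    vec = json.loads(ret.data.decode("UTF-8", "ignore"))  # the embedding values
    used_tokens = ret.count                                # new field from this commit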

@@ -1492,6 +1492,28 @@
                 ]
             }
         },
+        "/api/extra/embeddings": {
+            "post": {
+                "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+                "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "example": {},
+                            "schema": {
+                                "properties": {},
+                                "type": "object"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "tags": [
+                    "api/extra"
+                ],
+                "responses": {"default": {"description": ""}}
+            }
+        },
         "/api/extra/data/save": {
             "post": {
                 "description": "Saves data to a slot in a database file in the KoboldCpp server.",
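
For illustration, a request to the new endpoint could look like the following sketch. The host, port and the extra model field are placeholders; per embeddings_generate in koboldcpp.py below, "input" may be a single string or a list of strings:

import requests  # any HTTP client works; requests is used here only for brevity

payload = {
    "input": ["first text to embed", "second text to embed"],  # a plain string is also accepted
    "model": "kcpp-embeddings",  # placeholder; the handler only reads "input"
}
resp = requests.post("http://localhost:5001/api/extra/embeddings", json=payload)
print(resp.json())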

@@ -2152,6 +2174,28 @@
                 "responses": {"default": {"description": ""}}
             }
         },
+        "/v1/embeddings": {
+            "post": {
+                "summary": "Creates an embedding vector representing the input text. Please refer to OpenAI documentation",
+                "description": "Creates an embedding vector representing the input text.\n\nThis is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/embeddings/create](https://platform.openai.com/docs/api-reference/embeddings/create)",
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "example": {},
+                            "schema": {
+                                "properties": {},
+                                "type": "object"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "tags": [
+                    "v1"
+                ],
+                "responses": {"default": {"description": ""}}
+            }
+        },
     },
     "servers": [
         {
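
Because /v1/embeddings mirrors the OpenAI endpoint shape, an OpenAI-style client can in principle be pointed at the local server. A sketch only, not verified against this build; base URL, API key and model name are placeholders:

from openai import OpenAI  # assumes the 1.x openai package

client = OpenAI(base_url="http://localhost:5001/v1", api_key="not-needed")  # placeholders
result = client.embeddings.create(model="kcpp-embeddings", input="text to embed")
print(result)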

koboldcpp.py (44 changes)

@@ -336,6 +336,7 @@ class embeddings_generation_inputs(ctypes.Structure):
 
 class embeddings_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
+                ("count", ctypes.c_int),
                 ("data", ctypes.c_char_p)]
 
 def getdirpath():

@@ -1602,14 +1603,34 @@ def embeddings_load_model(model_filename):
 
 def embeddings_generate(genparams):
     global args
-    prompt = genparams.get("input", "")
-    inputs = embeddings_generation_inputs()
-    inputs.prompt = prompt.encode("UTF-8")
-    ret = handle.embeddings_generate(inputs)
-    outstr = ""
-    if ret.status==1:
-        outstr = ret.data.decode("UTF-8","ignore")
-    return outstr
+    prompts = []
+    if isinstance(genparams.get('input',[]), list):
+        prompts = genparams.get('input',[])
+    else:
+        prompt = genparams.get("input", "")
+        if prompt:
+            prompts.append(prompt)
+
+    tokarrs = []
+    tokcnt = 0
+    for prompt in prompts:
+        tokarr = []
+        tmpcnt = 0
+        try:
+            inputs = embeddings_generation_inputs()
+            inputs.prompt = prompt.encode("UTF-8")
+            ret = handle.embeddings_generate(inputs)
+            if ret.status==1:
+                outstr = ret.data.decode("UTF-8","ignore")
+                tokarr = json.loads(outstr) if outstr else []
+                tmpcnt = ret.count
+        except Exception as e:
+            tokarr = []
+            tmpcnt = 0
+            print(f"Error: {e}")
+        tokarrs.append(tokarr)
+        tokcnt += tmpcnt
+    return {"count":tokcnt, "data":tokarrs}
 
 def tokenize_ids(countprompt,tcaddspecial):
     rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
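
So the rewritten handler accepts either a single string or a list of strings, embeds each prompt separately, and returns one vector per input plus the summed token count, roughly:

# Hypothetical call, assuming the embeddings model and handle are already loaded.
gen = embeddings_generate({"input": ["first text", "second text"]})
# gen == {"count": <total tokens across both inputs>,
#         "data": [[...floats for "first text"...], [...floats for "second text"...]]}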

@@ -3227,7 +3248,12 @@ Enter Prompt:<br>
         elif is_embeddings:
             try:
                 gen = embeddings_generate(genparams)
-                genresp = (json.dumps({"object":"list","data":[{"object":"embedding","index":0,"embedding":[-0.003880035,-0.05006583]}],"model":"text-embedding-3-small","usage":{"prompt_tokens":2,"total_tokens":2}}).encode())
+                outdatas = []
+                odidx = 0
+                for od in gen["data"]:
+                    outdatas.append([{"object":"embedding","index":odidx,"embedding":od}])
+                    odidx += 1
+                genresp = (json.dumps({"object":"list","data":outdatas,"model":"koboldcpp-embeddings","usage":{"prompt_tokens":gen["count"],"total_tokens":gen["count"]}}).encode())
                 self.send_response(200)
                 self.send_header('content-length', str(len(genresp)))
                 self.end_headers(content_type='application/json')
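
With the loop above, a two-input request would serialize to a body along these lines (a sketch: the vector values are invented, and note that each data entry is appended as a single-element list):

example_response = {
    "object": "list",
    "data": [
        [{"object": "embedding", "index": 0, "embedding": [0.01, -0.02]}],
        [{"object": "embedding", "index": 1, "embedding": [0.03, 0.04]}],
    ],
    "model": "koboldcpp-embeddings",
    "usage": {"prompt_tokens": 7, "total_tokens": 7},
}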

@@ -24,7 +24,7 @@
 static llama_context * embeddings_ctx = nullptr; //text to codes ctx
 static std::string ttsplatformenv, ttsdeviceenv, ttsvulkandeviceenv;
 bool embeddings_debug = false;
-const int max_batchsize = 2048;
+static int max_batchsize = 512;
 static std::string last_output = "";
 
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {

@@ -128,16 +128,20 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     model_params.use_mlock = false;
     model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+
+    llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
+    const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
+
+    max_batchsize = n_ctx_train;
     ctx_params.embeddings = true;
     ctx_params.n_ubatch = ctx_params.n_ubatch = max_batchsize; //max size, must fit
-    ctx_params.n_ctx = max_batchsize + 512;
+    ctx_params.n_ctx = max_batchsize;
     ctx_params.logits_all = false;
     ctx_params.offload_kqv = true;
     ctx_params.n_threads = nthreads;
     ctx_params.n_threads_batch = nthreads;
     ctx_params.flash_attn = inputs.flash_attention;
 
-    llama_model * embeddingsmodel = llama_model_load_from_file(modelfile.c_str(), model_params);
     embeddings_ctx = llama_new_context_with_model(embeddingsmodel, ctx_params);
 
     if (embeddings_ctx == nullptr) {

@@ -156,7 +160,6 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
 
     const llama_vocab * vocab = llama_model_get_vocab(embeddingsmodel);
 
-    const int n_ctx_train = llama_model_n_ctx_train(embeddingsmodel);
     const int n_ctx = llama_n_ctx(embeddings_ctx);
 
     if (llama_model_has_encoder(embeddingsmodel) && llama_model_has_decoder(embeddingsmodel)) {

@@ -181,6 +184,7 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generation_inputs inputs)
         printf("\nWarning: KCPP Embeddings Model not initialized!\n");
         output.data = "";
         output.status = 0;
+        output.count = 0;
         return output;
     }
 

@@ -197,14 +201,19 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generation_inputs inputs)
     std::vector<std::vector<int32_t>> prompt_inputs;
     auto inp = common_tokenize(embeddings_ctx, prompt, true, true);
     if (inp.size() > n_batch) {
-        printf("\n%s: number of tokens in input line (%lld) exceeds batch size (%lld), lower token amount!\n",
+        printf("\n%s: number of tokens in an input (%lld) exceeds embedding size limit for this model (%lld), lower token amount!\n",
             __func__, (long long int) inp.size(), (long long int) n_batch);
         output.data = "";
         output.status = 0;
+        output.count = 0;
         return output;
     }
     prompt_inputs.push_back(inp);
 
+    if(embeddings_debug)
+    {
+        print_tok_vec(inp);
+    }
     printf("\nGenerating Embeddings for %d tokens...",inp.size());
 
     // initialize batch

@@ -272,5 +281,6 @@ embeddings_generation_outputs embeddingstype_generate(const embeddings_generation_inputs inputs)
 
     output.data = last_output.c_str();
     output.status = 1;
+    output.count = inp.size();
     return output;
 }