Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)
Merge branch 'master' into concedo_experimental
# Conflicts:
#    .devops/full-cuda.Dockerfile
#    .devops/full-rocm.Dockerfile
#    .devops/full.Dockerfile
#    .devops/main-rocm.Dockerfile
#    README.md
#    flake.lock
#    flake.nix
#    ggml-cuda.cu
#    requirements.txt
#    tests/CMakeLists.txt
Commit: fe7c200610
48 changed files with 1838 additions and 500 deletions

@@ -26,6 +26,7 @@
 #include <thread>
 #include <mutex>
 #include <chrono>
+#include <condition_variable>
 
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -442,7 +443,6 @@ struct llama_client_slot
         }
 
         images.clear();
-        // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }
 
     bool has_budget(gpt_params &global_params) {
@@ -543,7 +543,9 @@ struct llama_server_context
     std::vector<task_result> queue_results;
     std::vector<task_multi> queue_multitasks;
     std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
+    std::condition_variable condition_tasks;
    std::mutex mutex_results;
+    std::condition_variable condition_results;
 
     ~llama_server_context()
     {
@@ -922,6 +924,7 @@ struct llama_server_context
             llama_sampling_free(slot->ctx_sampling);
         }
         slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;
 
         all_slots_are_idle = false;
@@ -1170,7 +1173,7 @@ struct llama_server_context
 
     void send_error(task_server& task, std::string error)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = task.id;
         res.multitask_id = task.multitask_id;
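
A note on the std::lock_guard to std::unique_lock changes in this and the following hunks: std::condition_variable::wait() only accepts a std::unique_lock<std::mutex>, because it has to release the mutex while blocked and re-acquire it before returning. Producer-side functions such as send_error() never call wait() themselves, so the switch there is presumably just for a uniform locking style. A minimal illustration of the constraint, with generic names, not code from this commit:

#include <condition_variable>
#include <mutex>

std::mutex              m;
std::condition_variable cv;
bool ready = false;

void waiter() {
    // std::lock_guard<std::mutex> lock(m);  // would not compile with cv.wait(lock, ...)
    std::unique_lock<std::mutex> lock(m);    // wait() needs a lock it can unlock and re-lock
    cv.wait(lock, [&] { return ready; });    // releases m while blocked, re-acquires before returning
}

void notifier() {
    {
        std::lock_guard<std::mutex> lock(m); // a plain lock_guard is fine on the notifying side
        ready = true;
    }
    cv.notify_all();
}
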
@@ -1178,6 +1181,7 @@ struct llama_server_context
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void add_multi_task(int id, std::vector<int>& sub_ids)
@@ -1187,6 +1191,7 @@ struct llama_server_context
         multi.id = id;
         std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
         queue_multitasks.push_back(multi);
+        condition_tasks.notify_one();
     }
 
     void update_multi_task(int multitask_id, int subtask_id, task_result& result)
@@ -1198,6 +1203,7 @@ struct llama_server_context
             {
                 multitask.subtasks_remaining.erase(subtask_id);
                 multitask.results.push_back(result);
+                condition_tasks.notify_one();
             }
         }
     }
@@ -1216,7 +1222,7 @@ struct llama_server_context
             {"n_ctx", slot.n_ctx},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
-            {"temp", slot.sparams.temp},
+            {"temperature", slot.sparams.temp},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -1245,7 +1251,7 @@ struct llama_server_context
 
     void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1281,11 +1287,12 @@ struct llama_server_context
         }
 
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void send_final_response(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1341,11 +1348,12 @@ struct llama_server_context
         }
 
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void send_embedding(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1373,6 +1381,7 @@ struct llama_server_context
             };
         }
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     int request_completion(json data, bool infill, bool embedding, int multitask_id)
@@ -1396,6 +1405,7 @@ struct llama_server_context
 
         // otherwise, it's a single-prompt task, we actually queue it
         queue_tasks.push_back(task);
+        condition_tasks.notify_one();
         return task.id;
     }
 
@@ -1403,13 +1413,10 @@ struct llama_server_context
     {
         while (true)
         {
-            std::this_thread::sleep_for(std::chrono::microseconds(5));
-            std::lock_guard<std::mutex> lock(mutex_results);
-
-            if (queue_results.empty())
-            {
-                continue;
-            }
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
 
             for (int i = 0; i < (int) queue_results.size(); i++)
             {
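
The hunk above is the core of the change: next_result() used to poll (sleep 5 µs, take the lock, continue if the queue was empty); it now blocks on condition_results until a result is queued, and the producers notify after each push_back. A small self-contained sketch of the same wait/notify queue pattern, with hypothetical names, offered for orientation rather than as the project's code:

#include <condition_variable>
#include <deque>
#include <mutex>
#include <string>

// Hypothetical result queue mirroring the pattern the diff adopts.
std::deque<std::string> results;
std::mutex              mutex_results;
std::condition_variable condition_results;

// Producer side: push under the lock, then notify (as send_error / send_*_response now do).
void push_result(std::string r) {
    {
        std::unique_lock<std::mutex> lock(mutex_results);
        results.push_back(std::move(r));
    }
    condition_results.notify_all();
}

// Consumer side: block until the predicate holds instead of sleeping and re-checking.
std::string pop_result() {
    std::unique_lock<std::mutex> lock(mutex_results);
    condition_results.wait(lock, [&] { return !results.empty(); });
    std::string r = results.front();
    results.pop_front();
    return r;
}
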
@@ -1505,12 +1512,13 @@ struct llama_server_context
 
     void request_cancel(int task_id)
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
         task.type = CANCEL_TASK;
         task.target_id = task_id;
         queue_tasks.push_back(task);
+        condition_tasks.notify_one();
     }
 
     int split_multiprompt_task(task_server& multiprompt_task)
@@ -1536,7 +1544,7 @@ struct llama_server_context
 
     void process_tasks()
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
         while (!queue_tasks.empty())
         {
             task_server task = queue_tasks.front();
@@ -1608,6 +1616,7 @@ struct llama_server_context
 
                 std::lock_guard<std::mutex> lock(mutex_results);
                 queue_results.push_back(aggregate_result);
+                condition_results.notify_all();
 
                 queue_iterator = queue_multitasks.erase(queue_iterator);
             }
@@ -1638,8 +1647,10 @@ struct llama_server_context
                 LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
                 kv_cache_clear();
             }
-            // avoid 100% usage of cpu all time
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            condition_tasks.wait(lock, [&]{
+                return !queue_tasks.empty();
+            });
         }
 
         for (llama_client_slot &slot : slots)
@@ -2438,26 +2449,33 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;
 
     // Map OpenAI parameters to llama.cpp parameters
+    //
+    // For parameters that are defined by the OpenAI documentation (e.g.
+    // temperature), we explicitly specify OpenAI's intended default; we
+    // need to do that because sometimes OpenAI disagrees with llama.cpp
+    //
+    // https://platform.openai.com/docs/api-reference/chat/create
+    llama_sampling_params default_sparams;
     llama_params["model"]             = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
     llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
-    llama_params["top_k"]             = json_value(body, "top_k", 40);
-    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
     llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
     llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
     llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
     llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", 0);
+    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
     llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", false);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
-    llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
-    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
+    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
     llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
 
     if (body.count("grammar") != 0) {
         llama_params["grammar"] = json_value(body, "grammar", json::object());
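
The parameter mapping above leans on the server's json_value(body, key, default) helper: each OpenAI-style field is taken from the request when present, otherwise the hard-coded value the comment describes as OpenAI's intended default, or the corresponding field of a default-constructed llama_sampling_params, is used. A rough sketch of such a helper written against nlohmann::json, as an illustration of the defaulting behaviour rather than the project's exact implementation:

#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Illustrative defaulting helper: return body[key] when present and non-null,
// otherwise fall back to the supplied default value.
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    return (body.contains(key) && !body.at(key).is_null())
        ? body.at(key).get<T>()
        : default_value;
}

// Usage in the spirit of the hunk above:
//   llama_params["top_p"] = json_value(body, "top_p", 1.0);
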
@@ -3071,7 +3089,17 @@ int main(int argc, char **argv)
             {
                 prompt = "";
             }
-            const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+
+            json image_data;
+            if (body.count("image_data") != 0) {
+                image_data = body["image_data"];
+            }
+            else
+            {
+                image_data = "";
+            }
+
+            const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
             task_result result = llama.next_result(task_id);
             return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
         });
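
The last hunk lets this endpoint forward an optional image_data field to request_completion() alongside the prompt, defaulting to an empty string when the field is absent. A hypothetical client payload, written as an nlohmann::json literal; the inner element layout of image_data is handled elsewhere in the server and is assumed here, not taken from this diff:

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Hypothetical request body after this change.
// The inner {"id", "data"} layout is an assumption, not shown in this diff.
const json example_body = {
    {"prompt", "Describe the attached image."},
    {"image_data", json::array({
        json{ {"id", 1}, {"data", "<base64-encoded image bytes>"} }
    })}
};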