Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	examples/server/README.md
This commit is contained in:
Concedo 2024-11-06 23:13:00 +08:00
commit 628dcd640e
10 changed files with 1111 additions and 741 deletions

View file

@ -379,8 +379,8 @@ struct server_queue {
std::condition_variable condition_tasks;
// callback functions
std::function<void(server_task&)> callback_new_task;
std::function<void(void)> callback_update_slots;
std::function<void(server_task)> callback_new_task;
std::function<void(void)> callback_update_slots;
// Add a new task to the end of the queue
int post(server_task task, bool front = false) {
@ -432,7 +432,7 @@ struct server_queue {
}
// Register function to process a new task
void on_new_task(std::function<void(server_task &)> callback) {
void on_new_task(std::function<void(server_task)> callback) {
callback_new_task = std::move(callback);
}
@ -482,7 +482,7 @@ struct server_queue {
lock.unlock();
QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(task);
callback_new_task(std::move(task));
}
// all tasks in the current loop is processed, slots data is now ready
@ -645,17 +645,12 @@ struct server_context {
bool load_model(const common_params & params_) {
params = params_;
// reserve one extra sequence (seq_id == 0) for extra features
params.n_parallel += 1;
common_init_result llama_init = common_init_from_params(params);
model = llama_init.model;
ctx = llama_init.context;
loras = llama_init.lora_adapters;
params.n_parallel -= 1; // but be sneaky about it
if (model == nullptr) {
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
return false;
@ -1289,16 +1284,16 @@ struct server_context {
void send_embedding(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.id = slot.id_task;
res.error = false;
res.stop = true;
const int n_embd = llama_n_embd(model);
std::vector<float> embd_res(n_embd, 0.0f);
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
@ -1333,12 +1328,12 @@ struct server_context {
void send_rerank(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.id = slot.id_task;
res.error = false;
res.stop = true;
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
@ -1511,7 +1506,7 @@ struct server_context {
// Functions to process the task
//
void process_single_task(const server_task & task) {
void process_single_task(server_task task) {
switch (task.type) {
case SERVER_TASK_TYPE_INFERENCE:
{
@ -1567,11 +1562,11 @@ struct server_context {
for (server_slot & slot : slots) {
json slot_data = get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["id_task"] = slot.id_task;
slot_data["state"] = slot.state;
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
slot_data["next_token"] = {
slot_data["id"] = slot.id;
slot_data["id_task"] = slot.id_task;
slot_data["is_processing"] = slot.is_processing();
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"has_new_line", slot.has_new_line},
{"n_remain", slot.n_remaining},
@ -1582,10 +1577,10 @@ struct server_context {
{"stopping_word", slot.stopping_word},
};
if (slot_data["state"] == SLOT_STATE_IDLE) {
n_idle_slots++;
} else {
if (slot.is_processing()) {
n_processing_slots++;
} else {
n_idle_slots++;
}
slots_data.push_back(slot_data);
@ -1647,7 +1642,7 @@ struct server_context {
std::string filename = task.data.at("filename");
std::string filepath = task.data.at("filepath");
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
const double t_save_ms = (t_end - t_start) / 1000.0;
@ -1689,7 +1684,7 @@ struct server_context {
slot->cache_tokens.resize(slot->n_ctx);
size_t token_count = 0;
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
if (nread == 0) {
slot->cache_tokens.resize(0);
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
@ -1732,7 +1727,7 @@ struct server_context {
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
slot->cache_tokens.clear();
server_task_result result;
@ -1809,8 +1804,8 @@ struct server_context {
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@ -1837,7 +1832,7 @@ struct server_context {
slot.i_batch = batch.n_tokens;
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
slot.n_past += 1;
@ -1984,8 +1979,8 @@ struct server_context {
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@ -2034,9 +2029,9 @@ struct server_context {
}
// keep only the common part
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
// could not partially delete (likely using a non-Transformer model)
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
// there is no common part left
slot.n_past = 0;
@ -2049,7 +2044,7 @@ struct server_context {
// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);