mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile # README.md # flake.lock # ggml-cuda.cu # llama.cpp # tests/test-backend-ops.cpp # tests/test-quantize-fns.cpp
This commit is contained in:
commit
ad638285de
26 changed files with 3393 additions and 589 deletions
|
@ -1,8 +1,20 @@
|
|||
# llama.cpp/example/server
|
||||
# LLaMA.cpp HTTP Server
|
||||
|
||||
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Command line options:
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantum models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
|
|
|
@ -1337,6 +1337,10 @@ struct llama_server_context
|
|||
split_multiprompt_task(task_id, task);
|
||||
}
|
||||
} else {
|
||||
// an empty prompt can make slot become buggy
|
||||
if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get<std::string>().empty()) {
|
||||
task.data["prompt"] = " "; // add a space so that we have one token
|
||||
}
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
}
|
||||
|
@ -1637,8 +1641,8 @@ struct llama_server_context
|
|||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()}
|
||||
});
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
|
@ -1942,9 +1946,9 @@ struct llama_server_context
|
|||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
|
||||
|
|
|
@ -699,6 +699,8 @@ async def wait_for_health_status(context,
|
|||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
timeout = 3 # seconds
|
||||
if expected_health_status == 'ok':
|
||||
timeout = 10 # CI slow inference
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
async with aiohttp.ClientSession() as session:
|
||||
|
@ -736,7 +738,7 @@ async def wait_for_health_status(context,
|
|||
if n_completions > 0:
|
||||
return
|
||||
|
||||
assert False, 'timeout exceeded'
|
||||
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
|
||||
|
||||
|
||||
def assert_embeddings(embeddings):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue