Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-14 02:49:41 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/ISSUE_TEMPLATE/020-enhancement.yml
#	.github/ISSUE_TEMPLATE/030-research.yml
#	.github/ISSUE_TEMPLATE/040-refactor.yml
#	.github/workflows/build.yml
#	Makefile
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	examples/infill/infill.cpp
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup-stats.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	examples/retrieval/retrieval.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/kernels/CMakeLists.txt
#	ggml/src/ggml-cann/kernels/dup.cpp
#	ggml/src/ggml-cann/kernels/get_row_f16.cpp
#	ggml/src/ggml-cann/kernels/get_row_f32.cpp
#	ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
#	tests/test-arg-parser.cpp
#	tests/test-backend-ops.cpp
Commit 83350ec314
21 changed files with 801 additions and 377 deletions
@@ -176,7 +176,7 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sparams;
     struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
@@ -688,7 +688,7 @@ struct server_context {
 
         SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-        slot.sparams = params.sparams;
+        slot.sparams = params.sampling;
 
         slot.callback_on_release = [this](int) {
             queue_tasks.pop_deferred_task();
@@ -744,7 +744,7 @@ struct server_context {
             }
 
             // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-            int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+            int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
 
             // fraction of the common subsequence length compared to the current slot's prompt length
             float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
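For context on the hunk above: the server picks a slot for an incoming request by measuring how much of the slot's cached prompt overlaps the new prompt. Below is a minimal sketch of what an LCS-length helper such as `common_lcs` could compute over token vectors, using the classic O(n*m) dynamic program; the function name, the `llama_token` alias, and the signature are assumptions for illustration, not the repository's actual implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token = int32_t; // assumption: token IDs are 32-bit integers

// Length of the Longest Common Subsequence of two token sequences,
// keeping only one DP row to stay O(m) in memory.
size_t common_lcs_sketch(const std::vector<llama_token> & a,
                         const std::vector<llama_token> & b) {
    const size_t n = a.size();
    const size_t m = b.size();
    std::vector<size_t> dp(m + 1, 0); // dp[j] = LCS of a[0..i) and b[0..j)
    for (size_t i = 1; i <= n; i++) {
        size_t prev = 0; // holds dp[i-1][j-1]
        for (size_t j = 1; j <= m; j++) {
            const size_t cur = dp[j]; // dp[i-1][j] before the update
            if (a[i - 1] == b[j - 1]) {
                dp[j] = prev + 1;
            } else {
                dp[j] = std::max(dp[j], dp[j - 1]);
            }
            prev = cur;
        }
    }
    return dp[m];
}
```

The similarity fraction in the hunk would then be this LCS length divided by the cached prompt's length, so a slot whose cache mostly matches the new prompt scores close to 1.0.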
@@ -789,7 +789,7 @@ struct server_context {
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        auto default_sparams = params.sampling;
         const auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
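The hunk above also shows the defaults-then-override pattern for per-request sampling: the slot starts from the server-wide defaults and the request body may replace individual fields. A hedged sketch of that pattern, assuming the request body is an nlohmann::json object (as `data.count(...)` suggests); the struct and field names are illustrative only, not the server's actual types.

```cpp
#include <nlohmann/json.hpp>

// Illustrative stand-in for a sampling-parameters struct.
struct sampling_sketch {
    float temperature = 0.8f;
    int   top_k       = 40;
};

// Copy the global defaults, then let the request JSON override fields.
sampling_sketch apply_overrides(const sampling_sketch & defaults,
                                const nlohmann::json & data) {
    sampling_sketch out = defaults;
    out.temperature = data.value("temperature", out.temperature);
    out.top_k       = data.value("top_k",       out.top_k);
    return out;
}
```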
@@ -1961,7 +1961,7 @@ struct server_context {
 
                 if (slot.params.cache_prompt) {
                     // reuse any previously computed tokens that are common with the new prompt
-                    slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                    slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
 
                     // reuse chunks from the cached prompt by shifting their KV cache in the new position
                     if (params.n_cache_reuse > 0) {
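Note that the cache-reuse path above keys off the longest common prefix rather than a subsequence, because only an identical leading run of tokens can keep its KV-cache entries without recomputation. A minimal sketch of what a helper like `common_lcp` might compute; the name and signature are assumptions for illustration.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token = int32_t; // assumption: token IDs are 32-bit integers

// Number of leading tokens that two sequences share verbatim.
size_t common_lcp_sketch(const std::vector<llama_token> & a,
                         const std::vector<llama_token> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}
```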