Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/main-intel.Dockerfile
#	.devops/main-vulkan.Dockerfile
#	.devops/server-intel.Dockerfile
#	.devops/server-vulkan.Dockerfile
#	.github/workflows/bench.yml
#	.github/workflows/build.yml
#	.github/workflows/python-lint.yml
#	.github/workflows/server.yml
#	.gitignore
#	Makefile
#	README-sycl.md
#	README.md
#	ci/run.sh
#	flake.lock
#	llama.cpp
#	models/ggml-vocab-falcon.gguf
#	models/ggml-vocab-llama-spm.gguf
#	models/ggml-vocab-mpt.gguf
#	models/ggml-vocab-stablelm.gguf
#	models/ggml-vocab-starcoder.gguf
#	requirements.txt
#	scripts/check-requirements.sh
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-tokenizer-0-bpe.py
#	tests/test-tokenizer-0-spm.py
#	tests/test-tokenizer-1-spm.cpp
commit 17a24d753c
52 changed files with 4978 additions and 1249 deletions
@@ -1208,6 +1208,27 @@ struct server_context {
             LOG_VERBOSE("eos token found", {});
         }
 
+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+                && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                { "id_slot",              slot.id },
+                { "params.n_predict",     slot.params.n_predict },
+                { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                { "slot.n_decoded",       slot.n_decoded },
+                { "slot.n_predict",       slot.n_predict },
+                { "n_slots",              params.n_parallel },
+                { "slot.n_ctx",           slot.n_ctx },
+                { "n_ctx",                n_ctx },
+                { "n_ctx_train",          n_ctx_train },
+                { "ga_n",                 slot.ga_n },
+            });
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
         LOG_VERBOSE("next token", {
             {"id_slot",  slot.id},
             {"id_task",  slot.id_task},
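This hunk guards against models that never emit an EOS token: when the client sets no n_predict, the slot has no server-side limit, and group-attention self-extend is disabled (ga_n == 1), generation now stops once prompt plus decoded tokens reach the training context n_ctx_train. A minimal standalone sketch of the same stop condition; the SlotState struct below is a hypothetical simplification, not the server's real slot type:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical, simplified stand-in for the server slot fields used by the hunk.
    struct SlotState {
        int32_t params_n_predict = -1; // -1: client did not request a token limit
        int32_t n_predict        = -1; // -1: server default imposes no limit
        int32_t ga_n             = 1;  // 1: group-attention self-extend disabled
        int32_t n_prompt_tokens  = 0;
        int32_t n_decoded        = 0;
    };

    // True when generation must stop to avoid an EOS-less infinite loop.
    static bool must_stop_at_train_ctx(const SlotState & s, int32_t n_ctx_train) {
        return s.params_n_predict < 1 && s.n_predict < 1 && s.ga_n == 1
            && s.n_prompt_tokens + s.n_decoded >= n_ctx_train;
    }

    int main() {
        SlotState s;
        s.n_prompt_tokens = 10;
        s.n_decoded       = 4086;
        // With a 4096-token training context, 10 + 4086 >= 4096 triggers the cap.
        std::printf("stop: %s\n", must_stop_at_train_ctx(s, 4096) ? "yes" : "no");
        return 0;
    }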
@@ -2142,7 +2163,7 @@ struct server_context {
         });
 
         // process the created batch of tokens
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
            for (auto & slot : slots) {
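The loop walks the accumulated batch in chunks of at most n_batch tokens; the change only drops the explicit cast, which is presumably redundant since llama_batch's n_tokens field is already an int32_t. A standalone sketch of the chunking pattern, with placeholder values:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_tokens_total = 10; // stand-in for batch.n_tokens
        const int32_t n_batch        = 4;  // max tokens handed to one decode call

        // Same pattern as the hunk: each step processes the window [i, i + n_tokens).
        for (int32_t i = 0; i < n_tokens_total; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, n_tokens_total - i);
            std::printf("decode chunk at offset %d with %d tokens\n", (int) i, (int) n_tokens);
        }
        return 0;
    }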
@@ -2333,7 +2354,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("                            disable KV offload\n");
     }
     printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
     printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
     printf("                            model download url (default: unused)\n");
     printf("  -hfr REPO, --hf-repo REPO\n");
@@ -2357,6 +2378,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
     printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
+    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
     printf("  -spf FNAME, --system-prompt-file FNAME\n");
     printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  -ctk TYPE, --cache-type-k TYPE\n");
@@ -2372,7 +2394,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
     printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
     printf("  --chat-template JINJA_TEMPLATE\n");
@@ -2722,6 +2744,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             params.embedding = true;
         } else if (arg == "-cb" || arg == "--cont-batching") {
             params.cont_batching = true;
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            params.flash_attn = true;
         } else if (arg == "-np" || arg == "--parallel") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -2803,43 +2827,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
+            if (!parse_kv_override(argv[i], params.kv_overrides)) {
                 fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                 invalid_param = true;
                 break;
             }
-            params.kv_overrides.push_back(kvo);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argv[0], default_params, default_sparams);
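This hunk replaces the server's hand-rolled KEY=TYPE:VALUE parser with the shared parse_kv_override() helper, whose body lives outside this diff in common. A hedged sketch of the parsing it subsumes, assuming the helper follows the removed code's grammar plus the new str type advertised in the help text above; KvOverride and KvTag are hypothetical stand-ins for llama_model_kv_override and its tag enum:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for llama_model_kv_override and its tag enum.
    enum class KvTag { INT, FLOAT, BOOL, STR };

    struct KvOverride {
        char        key[128]    = {0};
        KvTag       tag         = KvTag::INT;
        long        int_value   = 0;
        double      float_value = 0.0;
        bool        bool_value  = false;
        std::string str_value;
    };

    // Parses "KEY=TYPE:VALUE" (types: int, float, bool, str); returns false on
    // malformed input, mirroring the error paths of the removed inline code.
    static bool parse_kv_override_sketch(const char * data, std::vector<KvOverride> & overrides) {
        const char * sep = std::strchr(data, '=');
        if (sep == nullptr || sep - data >= 128) {
            std::fprintf(stderr, "error: Malformed KV override: %s\n", data);
            return false;
        }
        KvOverride kvo;
        std::strncpy(kvo.key, data, sep - data);
        kvo.key[sep - data] = 0;
        sep++;
        if (std::strncmp(sep, "int:", 4) == 0) {
            kvo.tag       = KvTag::INT;
            kvo.int_value = std::atol(sep + 4);
        } else if (std::strncmp(sep, "float:", 6) == 0) {
            kvo.tag         = KvTag::FLOAT;
            kvo.float_value = std::atof(sep + 6);
        } else if (std::strncmp(sep, "bool:", 5) == 0) {
            kvo.tag = KvTag::BOOL;
            sep += 5;
            if (std::strcmp(sep, "true") == 0) {
                kvo.bool_value = true;
            } else if (std::strcmp(sep, "false") == 0) {
                kvo.bool_value = false;
            } else {
                std::fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", data);
                return false;
            }
        } else if (std::strncmp(sep, "str:", 4) == 0) {
            kvo.tag       = KvTag::STR;
            kvo.str_value = sep + 4;
        } else {
            std::fprintf(stderr, "error: Invalid type for KV override: %s\n", data);
            return false;
        }
        overrides.push_back(kvo);
        return true;
    }

    int main() {
        std::vector<KvOverride> overrides;
        // Same example the help text prints for --override-kv:
        bool ok = parse_kv_override_sketch("tokenizer.ggml.add_bos_token=bool:false", overrides);
        std::printf("parsed: %s, count: %zu\n", ok ? "ok" : "error", overrides.size());
        return 0;
    }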
@@ -2847,6 +2839,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         }
     }
 
+    gpt_params_handle_model_default(params);
+
     if (!params.kv_overrides.empty()) {
         params.kv_overrides.emplace_back();
         params.kv_overrides.back().key[0] = 0;
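gpt_params_handle_model_default() pairs with the updated --model help text earlier in this diff: when -m is not given, the model path is derived from --hf-file or --model-url, otherwise a built-in default is used. A speculative sketch of that defaulting rule (the real function is not shown in this diff; the default path value below is a placeholder):

    #include <cstdio>
    #include <string>

    // Speculative sketch of the rule implied by the new --model help text:
    // "model path (default: models/$filename with filename from --hf-file or
    //  --model-url if set, otherwise DEFAULT_MODEL_PATH)".
    static std::string model_default_sketch(const std::string & model,
                                            const std::string & hf_file,
                                            const std::string & model_url) {
        const std::string default_model_path = "models/7B/ggml-model-f16.gguf"; // placeholder value

        if (!model.empty()) {
            return model; // an explicit -m path always wins
        }
        if (!hf_file.empty()) {
            // take the last path segment of the Hugging Face file name
            return "models/" + hf_file.substr(hf_file.find_last_of('/') + 1);
        }
        if (!model_url.empty()) {
            // take the last path segment of the URL, dropping any ?query suffix
            std::string name = model_url.substr(model_url.find_last_of('/') + 1);
            name = name.substr(0, name.find('?'));
            return "models/" + name;
        }
        return default_model_path;
    }

    int main() {
        std::printf("%s\n", model_default_sketch("", "", "https://example.com/repo/model-q4_0.gguf?download=true").c_str());
        // prints: models/model-q4_0.gguf
        return 0;
    }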