mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-09 19:46:11 +00:00
Merge commit '017cc5f446' into concedo_experimental
# Conflicts: # .github/ISSUE_TEMPLATE/010-bug-compilation.yml # .github/ISSUE_TEMPLATE/019-bug-misc.yml # CODEOWNERS # examples/batched-bench/batched-bench.cpp # examples/batched/batched.cpp # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp # examples/gritlm/gritlm.cpp # examples/llama-bench/llama-bench.cpp # examples/passkey/passkey.cpp # examples/quantize-stats/quantize-stats.cpp # examples/run/run.cpp # examples/simple-chat/simple-chat.cpp # examples/simple/simple.cpp # examples/tokenize/tokenize.cpp # ggml/CMakeLists.txt # ggml/src/ggml-metal/CMakeLists.txt # ggml/src/ggml-vulkan/CMakeLists.txt # scripts/sync-ggml.last # src/llama.cpp # tests/test-autorelease.cpp # tests/test-model-load-cancel.cpp # tests/test-tokenizer-0.cpp # tests/test-tokenizer-1-bpe.cpp # tests/test-tokenizer-1-spm.cpp
This commit is contained in:
commit
dcfa1eca4e
45 changed files with 806 additions and 229 deletions
|
|
@ -848,7 +848,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
|
|
@ -875,7 +875,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
|
@ -886,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
|
|
@ -902,7 +902,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
|
@ -915,7 +915,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
|
@ -928,7 +928,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
|
|
@ -984,7 +984,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
|
|
@ -1413,7 +1413,7 @@ struct llama_model * common_load_model_from_url(
|
|||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(local_path.c_str(), params);
|
||||
return llama_model_load_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
|
|
|
|||
|
|
@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
|||
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
const common_ngram_cache_part part_static = part_static_it->second;
|
||||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
llama_token max_token = -1;
|
||||
llama_token max_token = LLAMA_TOKEN_NULL;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_static : part_static) {
|
||||
const llama_token token = token_count_static.first;
|
||||
|
|
@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
|
|||
}
|
||||
|
||||
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
|
||||
return -1;
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
|
||||
return -1;
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
return max_token;
|
||||
}
|
||||
|
|
@ -98,9 +98,9 @@ static llama_token try_draft(
|
|||
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
llama_token drafted_token = LLAMA_TOKEN_NULL;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
|
||||
const common_ngram ngram_primary = ngrams_primary[i];
|
||||
|
||||
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
|
||||
|
|
@ -112,7 +112,7 @@ static llama_token try_draft(
|
|||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
int sum_count_primary = 0;
|
||||
llama_token max_token = -1;
|
||||
llama_token max_token = LLAMA_TOKEN_NULL;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
|
|
@ -154,7 +154,7 @@ void common_ngram_cache_draft(
|
|||
}
|
||||
|
||||
while ((int) draft.size()-1 < n_draft) {
|
||||
llama_token drafted_token = -1;
|
||||
llama_token drafted_token = LLAMA_TOKEN_NULL;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
common_ngram ngram_static;
|
||||
|
|
@ -177,17 +177,17 @@ void common_ngram_cache_draft(
|
|||
}
|
||||
ngrams_cd.push_back(ngram_cd);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
if (drafted_token == LLAMA_TOKEN_NULL) {
|
||||
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
if (drafted_token == LLAMA_TOKEN_NULL) {
|
||||
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
if (drafted_token == LLAMA_TOKEN_NULL) {
|
||||
drafted_token = try_draft(nc_static, ngram_static);
|
||||
}
|
||||
|
||||
if (drafted_token == -1) {
|
||||
if (drafted_token == LLAMA_TOKEN_NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -17,13 +17,13 @@ struct common_ngram {
|
|||
|
||||
common_ngram() {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
tokens[i] = -1;
|
||||
tokens[i] = LLAMA_TOKEN_NULL;
|
||||
}
|
||||
}
|
||||
|
||||
common_ngram(const llama_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
tokens[i] = i < ngram_size ? input[i] : -1;
|
||||
tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue