mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build-cross.yml # .github/workflows/build-self-hosted.yml # .github/workflows/release.yml # examples/llama.android/lib/src/main/cpp/CMakeLists.txt # ggml/CMakeLists.txt # ggml/src/ggml-rpc/CMakeLists.txt # ggml/src/ggml-rpc/ggml-rpc.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/sync_vendor.py # tests/test-chat.cpp # tests/test-mtmd-c-api.c # tools/server/README.md
This commit is contained in:
commit
cd6788007e
86 changed files with 1384 additions and 1240 deletions
|
|
@ -1388,7 +1388,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
|||
|
||||
common_init_result::~common_init_result() = default;
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
std::string common_get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
|
|
@ -1403,6 +1403,42 @@ std::string get_model_endpoint() {
|
|||
return model_endpoint;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
if (mem == nullptr) {
|
||||
return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
}
|
||||
|
||||
common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
|
||||
|
||||
llama_memory_clear(mem, true);
|
||||
|
||||
// eval 2 tokens to check if the context is compatible
|
||||
std::vector<llama_token> tmp;
|
||||
tmp.push_back(0);
|
||||
tmp.push_back(0);
|
||||
|
||||
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
// try to remove the last tokens
|
||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
llama_memory_clear(mem, true);
|
||||
llama_synchronize(ctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
std::vector<llama_adapter_lora *> loras;
|
||||
std::vector<float> scales;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue