Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-16 20:09:41 +00:00
Merge commit '64eda5deb9' into concedo_experimental
# Conflicts:
#	.devops/cuda.Dockerfile
#	.devops/intel.Dockerfile
#	.devops/llama-cli-cann.Dockerfile
#	.devops/musa.Dockerfile
#	.devops/rocm.Dockerfile
#	.devops/vulkan.Dockerfile
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	README.md
#	docs/backend/SYCL.md
#	examples/llava/clip.cpp
#	examples/server_embd.py
#	ggml/src/ggml-cann/acl_tensor.cpp
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/aclnn_ops.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	src/CMakeLists.txt
#	tests/test-chat-template.cpp
commit ea9bd61e47
24 changed files with 1059 additions and 71 deletions
@@ -1705,6 +1705,8 @@ private:
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1759,6 +1761,10 @@ struct server_response {
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1789,6 +1795,10 @@ struct server_response {
             }
 
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
             }
@@ -1818,6 +1828,12 @@ struct server_response {
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
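The hunks above implement the usual stop-flag-plus-condition-variable shutdown idiom: a `running` flag checked inside the wait predicate, and a `terminate()` that flips the flag and calls `notify_all()` so no waiter stays blocked. The sketch below is a standalone illustration of that idiom, not part of the patch; every name in it (`result_queue`, `recv`, `send`, `stop`) is invented for the example, and unlike the server code it simply returns from `recv()` on shutdown, since here the caller has somewhere sensible to return to.

#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <string>
#include <thread>
#include <utility>

// Stand-in for server_response: a result queue whose consumers can be woken
// either by new data or by a shutdown request. Only the flag + notify_all()
// idea mirrors the patch; everything else is illustrative.
struct result_queue {
    std::mutex              mtx;
    std::condition_variable cv;
    std::deque<std::string> items;
    bool                    running = true;

    // Consumer: block until an item arrives or stop() is called.
    // Returns false once the queue is stopped and drained.
    bool recv(std::string & out) {
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, [&] { return !items.empty() || !running; });
        if (!items.empty()) {
            out = std::move(items.front());
            items.pop_front();
            return true;
        }
        return false; // stopped; unlike the server, this caller can just return
    }

    void send(std::string item) {
        {
            std::lock_guard<std::mutex> lock(mtx);
            items.push_back(std::move(item));
        }
        cv.notify_all();
    }

    // Owner: flip the flag (here, under the lock, to avoid a missed wakeup),
    // then wake every waiter so none of them stays blocked in cv.wait() forever.
    void stop() {
        {
            std::lock_guard<std::mutex> lock(mtx);
            running = false;
        }
        cv.notify_all();
    }
};

int main() {
    result_queue q;
    std::thread consumer([&] {
        std::string s;
        while (q.recv(s)) {
            std::printf("got: %s\n", s.c_str());
        }
        std::printf("consumer: queue stopped\n");
    });
    q.send("hello");
    q.stop();        // without this, consumer.join() below would block forever
    consumer.join();
    return 0;
}

In the server the waiter is an HTTP handler deep inside the httplib call stack, which is why the patch opts for std::terminate() rather than plumbing an error result back out ("we cannot return here since the caller is HTTP code").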
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
         SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
+        ctx_server.queue_results.terminate();
         llama_backend_free();
     };
 
@@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.start_loop();
 
     clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();
 
     return 0;
 }
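With ctx_server.queue_results.terminate() added to clean_up(), the two t.join() calls above no longer need the old FIXME workaround: after shutdown begins, any HTTP handler still blocked in recv() is woken by notify_all(), sees running == false and, per the comments in the earlier hunks, calls std::terminate() rather than waiting forever; in the normal case, where nothing is waiting, the joins simply return. Either way the process can no longer hang on exit, which is what the removed FIXME comment was warning about.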
@@ -49,6 +49,26 @@ def test_embedding_multiple():
         assert len(d['embedding']) > 1
 
 
+def test_embedding_multiple_with_fa():
+    server = ServerPreset.bert_bge_small_with_fa()
+    server.pooling = 'last'
+    server.start()
+    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "a "*253,
+            "b "*254,
+            "c "*255,
+            "d "*256,
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
 @pytest.mark.parametrize(
     "input,is_multi_prompt",
     [
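This test pairs with the bert_bge_small_with_fa preset added below (the existing bert-bge-small preset with server.fa = True and n_batch/n_ubatch of 300); the staggered input lengths are presumably there so that at least one request ends up at a context size divisible by 256, which the comment identifies as the condition for the flash-attention branch to be exercised.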
@@ -323,6 +323,21 @@ class ServerPreset:
         server.server_embeddings = True
         return server
 
+    @staticmethod
+    def bert_bge_small_with_fa() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 1024
+        server.n_batch = 300
+        server.n_ubatch = 300
+        server.n_slots = 2
+        server.fa = True
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
     @staticmethod
     def tinyllama_infill() -> ServerProcess:
         server = ServerProcess()

@@ -3,7 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "common/base64.hpp"
+#include "base64.hpp"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576