Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakePresets.json
#	Makefile
#	Package.swift
#	ci/run.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	flake.lock
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-backend.cpp
#	ggml/src/ggml.c
#	pocs/vdot/q8dot.cpp
#	pocs/vdot/vdot.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-quantize-fns.cpp
#	tests/test-quantize-perf.cpp
#	tests/test-rope.cpp
This commit is contained in:
Concedo 2024-11-04 16:54:53 +08:00
commit bb13925f39
34 changed files with 14917 additions and 19458 deletions

View file

@ -248,6 +248,7 @@ struct server_slot {
if (is_processing()) {
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
t_last_used = ggml_time_us();
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
callback_on_release(id);
@ -731,7 +732,7 @@ struct server_context {
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
int max_lcs_len = 0;
int lcs_len = 0;
float similarity = 0;
for (server_slot & slot : slots) {
@ -746,20 +747,21 @@ struct server_context {
}
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
// fraction of the common subsequence length compared to the current slot's prompt length
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
// select the current slot if the criteria match
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
max_lcs_len = lcs_len;
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
lcs_len = cur_lcs_len;
similarity = cur_similarity;
ret = &slot;
}
}
if (ret != nullptr) {
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
}
}
@ -2704,8 +2706,8 @@ int main(int argc, char ** argv) {
};
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
if (ctx_server.params.embedding || ctx_server.params.reranking) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@ -2810,8 +2812,8 @@ int main(int argc, char ** argv) {
// TODO: maybe merge this function with "handle_completions_generic"
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding || ctx_server.params.reranking) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@ -2936,11 +2938,6 @@ int main(int argc, char ** argv) {
};
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
// TODO: somehow clean up this checks in the future
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
const json body = json::parse(req.body);
bool is_openai = false;
@ -2992,10 +2989,11 @@ int main(int argc, char ** argv) {
};
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params.reranking) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
const json body = json::parse(req.body);
// TODO: implement