Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.devops/tools.sh
#	.github/workflows/build.yml
#	Makefile
#	README.md
#	common/CMakeLists.txt
#	common/common.h
#	examples/llava/CMakeLists.txt
#	examples/run/CMakeLists.txt
#	examples/run/README.md
#	examples/run/run.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-kompute/ggml-kompute.cpp
#	tests/test-backend-ops.cpp
#	tests/test-rope.cpp
Commit f456ed7237 by Concedo, 2024-12-15 15:30:10 +08:00
38 changed files with 10752 additions and 105 deletions

@@ -459,7 +459,7 @@ struct server_task_result_cmpl_final : server_task_result {
     int32_t n_decoded;
     int32_t n_prompt_tokens;
     int32_t n_tokens_cached;
-    int32_t has_new_line;
+    bool has_new_line;
     std::string stopping_word;
     stop_type stop = STOP_TYPE_NONE;
@@ -1079,9 +1079,9 @@ struct server_slot {
     SLT_INF(*this,
             "\n"
-            "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
-            "\r       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
-            "\r      total time = %10.2f ms / %5d tokens\n",
+            "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+            "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+            "      total time = %10.2f ms / %5d tokens\n",
             t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
             t_token_generation, n_decoded, t_gen, n_gen_second,
             t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
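
Note on the first hunk: with the field now declared as bool, the server serializes has_new_line as a JSON boolean in the final /completion result. A minimal client-side check, sketched under the assumption of a llama-server instance at localhost:8080 (host, port, and prompt here are illustrative, not part of this commit):

# Minimal sketch, not part of the commit: query a running llama-server
# and verify the final result's has_new_line field is a JSON boolean.
import requests  # assumed available; any HTTP client works

res = requests.post(
    "http://localhost:8080/completion",  # assumed default endpoint/port
    json={"prompt": "Hello", "n_predict": 16},
)
body = res.json()
# After this change the field deserializes as bool, not int.
assert isinstance(body["has_new_line"], bool)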

@@ -25,6 +25,7 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
     assert res.body["timings"]["prompt_n"] == n_prompt
     assert res.body["timings"]["predicted_n"] == n_predicted
     assert res.body["truncated"] == truncated
+    assert type(res.body["has_new_line"]) == bool
     assert match_regex(re_content, res.body["content"])
@@ -48,6 +49,7 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
     assert data["timings"]["predicted_n"] == n_predicted
     assert data["truncated"] == truncated
     assert data["stop_type"] == "limit"
+    assert type(data["has_new_line"]) == bool
     assert "generation_settings" in data
     assert server.n_predict is not None
     assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
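
The streaming test checks the same field on the final SSE chunk. A client-side equivalent, sketched under the same assumptions (local server; the "data: " framing and the stop flag on the final chunk reflect the streaming format these tests exercise):

# Minimal sketch, not part of the commit: read the streamed /completion
# response and check has_new_line on the final chunk (where "stop" is true).
import json
import requests  # assumed available

with requests.post(
    "http://localhost:8080/completion",  # assumed endpoint/port
    json={"prompt": "Hello", "n_predict": 16, "stream": True},
    stream=True,
) as res:
    for raw in res.iter_lines():
        if raw and raw.startswith(b"data: "):
            data = json.loads(raw[len(b"data: "):])
            if data.get("stop"):
                # The final chunk carries timings, truncated, has_new_line, etc.
                assert isinstance(data["has_new_line"], bool)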

@@ -22,7 +22,7 @@
 #include <vector>
 #include <memory>
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 using json = nlohmann::ordered_json;
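
DEFAULT_OAICOMPAT_MODEL is the fallback model name echoed by the OpenAI-compatible endpoints when a request does not specify one. A sketch of where the new value would surface, assuming a local llama-server and a request that omits "model" (both assumptions):

# Minimal sketch, not part of the commit: with no "model" in the request,
# the OAI-compatible response falls back to DEFAULT_OAICOMPAT_MODEL.
import requests  # assumed available

res = requests.post(
    "http://localhost:8080/v1/chat/completions",  # assumed endpoint/port
    json={"messages": [{"role": "user", "content": "Hi"}]},
)
print(res.json()["model"])  # expected: "gpt-3.5-turbo" after this change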