Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-13 02:19:41 +00:00

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#   .devops/cuda.Dockerfile
#   CODEOWNERS
#   README.md
#   ggml/src/ggml-cann/aclnn_ops.cpp
#   ggml/src/ggml-cann/common.h
#   ggml/src/ggml-opencl/ggml-opencl.cpp
#   scripts/sync-ggml-am.sh
#   scripts/sync-ggml.last
#   scripts/sync-ggml.sh
#   tests/test-chat.cpp
#   tools/batched-bench/batched-bench.cpp
#   tools/mtmd/clip.h
Commit 1c41c38a6a
40 changed files with 426 additions and 274 deletions

@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
 
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total", n_tokens_predicted_total },
             { "t_prompt_processing_total", t_prompt_processing_total },
 
+            { "n_past_max", n_past_max },
+
             { "n_prompt_tokens_processed", n_prompt_tokens_processed },
             { "t_prompt_processing", t_prompt_processing },
             { "n_tokens_predicted", n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
 
@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing += slot.t_prompt_processing;
         t_prompt_processing_total += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }
 
     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }
 
@@ -1716,7 +1729,7 @@ struct server_queue {
     void pop_deferred_task() {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         if (!queue_tasks_deferred.empty()) {
-            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
+            queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
             queue_tasks_deferred.pop_front();
         }
         condition_tasks.notify_one();
@@ -2875,6 +2888,8 @@ struct server_context {
             res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
             res->t_tokens_generation_total = metrics.t_tokens_generation_total;
 
+            res->n_past_max = metrics.n_past_max;
+
             res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
             res->t_prompt_processing = metrics.t_prompt_processing;
             res->n_tokens_predicted = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                     {"name", "n_decode_total"},
                     {"help", "Total number of llama_decode() calls"},
                     {"value", res_metrics->n_decode_total}
+            }, {
+                    {"name", "n_past_max"},
+                    {"help", "Largest observed n_past."},
+                    {"value", res_metrics->n_past_max}
             }, {
                     {"name", "n_busy_slots_per_decode"},
                     {"help", "Average number of busy slots per llama_decode() call"},

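The new n_past_max value added above also flows into the server's Prometheus-style /metrics endpoint (exposed when the server is started with --metrics). A minimal sketch of reading it back, assuming a server listening on localhost:8080; the exact metric name prefix is an assumption, not taken from this diff:

    # Sketch only: poll the /metrics endpoint and print any exported n_past_max value.
    # Assumes the server was started with --metrics and listens on localhost:8080;
    # the metric name prefix may differ.
    import urllib.request

    with urllib.request.urlopen("http://localhost:8080/metrics") as resp:
        body = resp.read().decode("utf-8")

    for line in body.splitlines():
        if "n_past_max" in line and not line.startswith("#"):
            print(line)
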
@@ -5,7 +5,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()

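This hunk (and the matching ones in the test files below) drops scope="module", so the autouse create_server fixture falls back to pytest's default function scope and a fresh server preset is prepared before every test rather than once per file. A standalone illustration of that scoping difference, not taken from the repository:

    # Hypothetical minimal pytest file: with the default function scope the autouse
    # fixture runs before every test; scope="module" would run it only once per file.
    import pytest

    setup_calls = []

    @pytest.fixture(autouse=True)  # default scope is "function"
    def fresh_state():
        setup_calls.append("setup")

    def test_first():
        assert setup_calls.count("setup") == 1

    def test_second():
        assert setup_calls.count("setup") == 2  # the fixture ran again for this test
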
@@ -7,7 +7,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()
@@ -229,7 +229,7 @@ def test_nocache_long_input_prompt():
         "temperature": 1.0,
         "cache_prompt": False,
     })
-    assert res.status_code == 200
+    assert res.status_code == 400
 
 
 def test_completion_with_tokens_input():

@@ -11,7 +11,7 @@ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
 Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
 """.strip()
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()
@@ -25,6 +25,7 @@ def test_ctx_shift_enabled():
     # the prompt is truncated to keep the last 109 tokens
     # 64 tokens are generated thanks to shifting the context when it gets full
     global server
+    server.enable_ctx_shift = True
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": 64,
@@ -42,7 +43,6 @@ def test_ctx_shift_enabled():
 ])
 def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
     global server
-    server.disable_ctx_shift = True
     server.n_predict = -1
     server.start()
     res = server.make_request("POST", "/completion", data={
@@ -56,7 +56,6 @@ def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, tr
 
 def test_ctx_shift_disabled_long_prompt():
     global server
-    server.disable_ctx_shift = True
     server.start()
     res = server.make_request("POST", "/completion", data={
         "n_predict": 64,
@@ -68,7 +67,6 @@ def test_ctx_shift_disabled_long_prompt():
 
 def test_ctx_shift_disabled_stream():
     global server
-    server.disable_ctx_shift = True
     server.start()
     res = server.make_stream_request("POST", "/v1/completions", data={
         "n_predict": 256,

@@ -8,7 +8,7 @@ server = ServerPreset.bert_bge_small()
 
 EPSILON = 1e-3
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.bert_bge_small()

@@ -3,7 +3,7 @@ from utils import *
 
 server = ServerPreset.tinyllama_infill()
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama_infill()

@@ -5,7 +5,7 @@ server = ServerPreset.stories15m_moe()
 
 LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.stories15m_moe()

@@ -4,7 +4,7 @@ from utils import *
 server = ServerPreset.jina_reranker_tiny()
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.jina_reranker_tiny()

@@ -6,7 +6,7 @@ server = ServerPreset.tinyllama2()
 
 TEST_API_KEY = "sk-this-is-the-secret-key"
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()

@@ -3,7 +3,7 @@ from utils import *
 
 server = ServerPreset.tinyllama2()
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()

@@ -16,7 +16,7 @@ def create_server():
     server.draft_max = 8
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def fixture_create_server():
     return create_server()
 
@@ -91,6 +91,7 @@ def test_slot_ctx_not_exceeded():
 def test_with_ctx_shift():
     global server
     server.n_ctx = 64
+    server.enable_ctx_shift = True
     server.start()
     res = server.make_request("POST", "/completion", data={
         "prompt": "Hello " * 56,

@@ -4,7 +4,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
 
 
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
     global server
     server = ServerPreset.tinyllama2()
@@ -22,6 +22,8 @@ def create_server():
     server.model_alias = "tinyllama-2-tool-call"
     server.server_port = 8081
     server.n_slots = 1
+    server.n_ctx = 8192
+    server.n_batch = 2048
 
 class CompletionMode(Enum):
     NORMAL = "normal"

@@ -79,7 +79,7 @@ class ServerProcess:
     draft: int | None = None
     api_key: str | None = None
     lora_files: List[str] | None = None
-    disable_ctx_shift: int | None = False
+    enable_ctx_shift: int | None = False
     draft_min: int | None = None
     draft_max: int | None = None
     no_webui: bool | None = None
@@ -178,8 +178,8 @@ class ServerProcess:
         if self.lora_files:
             for lora_file in self.lora_files:
                 server_args.extend(["--lora", lora_file])
-        if self.disable_ctx_shift:
-            server_args.extend(["--no-context-shift"])
+        if self.enable_ctx_shift:
+            server_args.append("--context-shift")
         if self.api_key:
             server_args.extend(["--api-key", self.api_key])
         if self.draft_max: