Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	requirements/requirements-all.txt
This commit is contained in:
Concedo 2025-07-15 23:59:53 +08:00
commit ce7aa0d5c0
7 changed files with 604 additions and 1103 deletions

File diff suppressed because it is too large Load diff

View file

@ -3,7 +3,10 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
template<typename src_t, typename dst_t>
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
GGML_UNUSED(src_f);
GGML_UNUSED(dst_f);
}
template<>
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
@ -53,6 +56,9 @@ static __global__ void k_set_rows(
const src_t* src_elem = src0_row + i00;
dst_t* dst_elem = dst_row_ptr + i00;
set_rows_1(src_elem, dst_elem);
GGML_UNUSED(ne10);
GGML_UNUSED(ne13);
}
template<typename src_t, typename dst_t>

View file

@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
-->
<script id="init-config">
const LITEVER = 261;
const LITEVER = 262;
const urlParams = new URLSearchParams(window.location.search);
var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -3401,10 +3401,12 @@ Current version indicated by LITEVER below.
entersubmit: true, //enter sends the prompt
darkmode: true,
render_streaming_markdown: true,
raw_instruct_tags: false, //experimental flag
raw_instruct_tags: false, //experimental flags
show_endpoint_selector: false,
no_warn_unsaved: false,
no_compress_audio: false,
autoguess_third_party:false,
//section migrated from story itself
extrastopseq: "",
@ -4399,7 +4401,10 @@ Current version indicated by LITEVER below.
let instag = localsettings.instruct_starttag;
if(instag=="{{[INPUT]}}" && !(custom_kobold_endpoint != "" && is_using_kcpp_with_autotags()))
{
instag = "\n### Instruction:\n"; //backend not compatible with auto
if(!localsettings.autoguess_third_party)
{
instag = "\n### Instruction:\n"; //backend not compatible with auto
}
}
if(doTrim){
return replaceAll(instag, "\\n", "\n").trim();
@ -4412,7 +4417,10 @@ Current version indicated by LITEVER below.
let instag = localsettings.instruct_endtag;
if(instag=="{{[OUTPUT]}}" && !(custom_kobold_endpoint != "" && is_using_kcpp_with_autotags()))
{
instag = "\n### Response:\n"; //backend not compatible with auto
if(!localsettings.autoguess_third_party)
{
instag = "\n### Response:\n"; //backend not compatible with auto
}
}
if(doTrim){
return replaceAll(instag, "\\n", "\n").trim();
@ -12630,6 +12638,7 @@ Current version indicated by LITEVER below.
document.getElementById("show_endpoint_selector").checked = localsettings.show_endpoint_selector;
document.getElementById("no_warn_unsaved").checked = localsettings.no_warn_unsaved;
document.getElementById("no_compress_audio").checked = localsettings.no_compress_audio;
document.getElementById("autoguess_third_party").checked = localsettings.autoguess_third_party;
document.getElementById("render_streaming_markdown").checked = localsettings.render_streaming_markdown;
document.getElementById("min_p").value = localsettings.min_p;
document.getElementById("dynatemp_range").value = localsettings.dynatemp_range;
@ -13161,6 +13170,7 @@ Current version indicated by LITEVER below.
localsettings.show_endpoint_selector = (document.getElementById("show_endpoint_selector").checked ? true : false);
localsettings.no_warn_unsaved = (document.getElementById("no_warn_unsaved").checked ? true : false);
localsettings.no_compress_audio = (document.getElementById("no_compress_audio").checked ? true : false);
localsettings.autoguess_third_party = (document.getElementById("autoguess_third_party").checked ? true : false);
localsettings.render_streaming_markdown = (document.getElementById("render_streaming_markdown").checked ? true : false);
if(document.getElementById("opmode").value==1)
{
@ -18470,12 +18480,15 @@ Current version indicated by LITEVER below.
gentxt = gentxt.substring(curtag.length);
}
let found = gentxt.indexOf(curtag);
let splitresponse = [];
if (found != -1) //if found, truncate to it
if(localsettings.includedefaultstops)
{
splitresponse = gentxt.split(curtag);
gentxt = splitresponse[0];
let found = gentxt.indexOf(curtag);
let splitresponse = [];
if (found != -1) //if found, truncate to it
{
splitresponse = gentxt.split(curtag);
gentxt = splitresponse[0];
}
}
}
@ -25329,6 +25342,11 @@ Current version indicated by LITEVER below.
class="helptext">Do not compress embedded audio files. Might crash on big files! (caution!)</span></span></div>
<input title="Do Not Recompress Audio" type="checkbox" id="no_compress_audio" style="margin:0px 0px 0px 0px;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">AutoguessTagsForThirdParty <span class="helpicon">?<span
class="helptext">Sends raw KoboldCppAutomatic AutoGuess tags to third party APIs. (e.g. Horde) Be warned, you better hope they handle them well internally...</span></span></div>
<input title="Send Autoguess Tags For Third Party APIs" type="checkbox" id="autoguess_third_party" style="margin:0px 0px 0px 0px;">
</div>
</div>
<div class="settingitem wide">

View file

@ -0,0 +1,5 @@
datasets~=3.2.0
matplotlib~=3.10.0
numpy~=1.26.4
requests~=2.32.3
tqdm~=4.67.1

210
scripts/server-bench.py Normal file
View file

@ -0,0 +1,210 @@
#!/usr/bin/env python3
import argparse
import json
import subprocess
from time import sleep, time
from typing import Optional
import datasets
import logging
import matplotlib.pyplot as plt
import numpy as np
import requests
from tqdm.contrib.concurrent import thread_map
# Configure the root logger to print bare messages (no level or timestamp prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(message)s')
# Named logger used by the functions below for progress and result output.
logger = logging.getLogger("server-bench")
def get_prompts(n_prompts: int) -> list[str]:
    """Return the questions of the MMLU test split to use as benchmark prompts.

    Args:
        n_prompts: Maximum number of prompts to return. A negative value
            returns every question in the split.

    Returns:
        The first ``n_prompts`` questions (or all of them if ``n_prompts`` is negative).
    """
    logger.info("Loading MMLU dataset...")
    mmlu = datasets.load_dataset("cais/mmlu", "all")  # type: ignore
    questions = mmlu["test"]["question"]  # type: ignore
    # Slicing is only applied for non-negative limits; a negative limit means "no limit".
    return questions if n_prompts < 0 else questions[:n_prompts]
def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
    """Start a llama.cpp server subprocess and block until its /health endpoint returns 200.

    Args:
        path_server: Path to the server binary.
        path_model: Path to the model file passed via ``--model``.
        path_log: File that receives the server's stdout and stderr. If ``None``,
            the server output is discarded.
        port: TCP port the server listens on.
        n_gpu_layers: Value passed via ``--n-gpu-layers``.
        parallel: Number of server slots (``--parallel``).
        ctx_size: Context size per slot; the server is started with
            ``parallel * ctx_size`` as its total ``--ctx-size``.

    Returns:
        A dict with the keys ``"process"`` (the ``subprocess.Popen`` handle),
        ``"address"`` (base URL of the server) and ``"fout"`` (the open log file,
        or ``subprocess.DEVNULL``). The caller is responsible for terminating the
        process and closing the log file.

    Raises:
        RuntimeError: If the server process exits during startup, or if the
            health endpoint could not be reached on 10 polling attempts.
    """
    logger.info("Starting the llama.cpp server...")
    address = f"http://localhost:{port}"

    popen_args: list[str] = [
        path_server,
        "--flash-attn",
        "--n-gpu-layers", str(n_gpu_layers),
        "--parallel", str(parallel),
        "--ctx-size", str(parallel * ctx_size),
        "--model", path_model,
        "--port", str(port),
        "--swa-full",  # FIXME performance bad otherwise
        # "--attn-streams",
    ]

    # Honour the requested log path instead of always writing to a fixed file name.
    fout = open(path_log, "w") if path_log is not None else subprocess.DEVNULL
    process: Optional[subprocess.Popen] = None
    try:
        process = subprocess.Popen(popen_args, stdout=fout, stderr=subprocess.STDOUT)

        n_failures: int = 0
        while True:
            try:
                # Poll once per second: first make sure the process is still alive, then ask for its health.
                sleep(1.0)
                exit_code = process.poll()
                if exit_code is not None:
                    raise RuntimeError(f"llama.cpp server for {path_model} exited unexpectedly with exit code {exit_code}")
                response = requests.get(f"{address}/health")
                if response.status_code == 200:
                    break
                # Any other status code is retried without counting as a failure.
            except requests.ConnectionError:
                n_failures += 1
                if n_failures >= 10:
                    raise RuntimeError(f"llama.cpp server for {path_model} is not healthy after 10 seconds")
    except BaseException:
        # The handles are only handed to the caller on success, so they must be
        # released here or a failed startup would leak the child process and the log file.
        if process is not None and process.poll() is None:
            process.terminate()
            process.wait()
        if hasattr(fout, "close"):
            fout.close()
        raise

    return {"process": process, "address": address, "fout": fout}
def get_prompt_length(data: dict) -> int:
    """Return the number of tokens of a prompt after the server's chat template is applied.

    Args:
        data: Dict with the keys ``"session"`` (object providing ``post``),
            ``"server_address"`` (base URL) and ``"prompt"`` (user message text).

    Returns:
        Length of the ``"tokens"`` list returned by the server's tokenize endpoint.

    Raises:
        RuntimeError: If either request returns a status code other than 200.
    """
    http = data["session"]
    base_url: str = data["server_address"]

    def post_and_parse(endpoint: str, payload: dict) -> dict:
        # Shared request/validate/decode step for both endpoints.
        reply = http.post(f"{base_url}/{endpoint}", json=payload)
        if reply.status_code != 200:
            raise RuntimeError(f"Server returned status code {reply.status_code}: {reply.text}")
        return json.loads(reply.text)

    # Step 1: let the server render the chat template around the user message.
    templated = post_and_parse(
        "apply-template",
        {"messages": [{"role": "user", "content": data["prompt"], "stream": True}]},
    )
    rendered: str = templated["prompt"]

    # Step 2: tokenize the rendered prompt and count the tokens.
    tokenized = post_and_parse("tokenize", {"content": rendered, "add_special": True})
    return len(tokenized["tokens"])
def send_prompt(data: dict) -> tuple[float, list[float]]:
    """Run one streamed completion request and record when each data event arrived.

    Args:
        data: Dict with the keys ``"session"`` (object providing ``post``),
            ``"server_address"`` (base URL), ``"prompt"`` (user message text),
            ``"seed"`` and ``"n_predict"`` (forwarded to the completion request).

    Returns:
        A tuple ``(prompt_ms, token_arrival_times)``. ``prompt_ms`` is read from the
        ``"timings"`` object of the last data event; ``token_arrival_times`` holds the
        ``time()`` value at which every data event except the last one was received.

    Raises:
        RuntimeError: If a request returns a status code other than 200, or if the
            completion stream ends without a single data event.
    """
    session = data["session"]
    server_address: str = data["server_address"]

    response = session.post(
        f"{server_address}/apply-template",
        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
    )
    if response.status_code != 200:
        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
    prompt: str = json.loads(response.text)["prompt"]

    json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
    response = session.post(f"{server_address}/completion", json=json_data, stream=True)
    # Validate the status before touching the stream: once the body has been iterated,
    # the error text may no longer be readable and the real failure would be masked.
    if response.status_code != 200:
        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")

    last_valid_line: str = ""
    token_arrival_times: list[float] = []
    for line in response.iter_lines(decode_unicode=True):
        # NOTE(review): iter_lines can yield bytes when the response declares no encoding;
        # normalise so the prefix test below does not fail on a bytes/str mismatch.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        if not line.startswith("data: "):
            continue
        last_valid_line = line
        token_arrival_times.append(time())

    if not last_valid_line:
        raise RuntimeError("Server closed the completion stream without sending any data")

    # The last data event is parsed for timings below and is not counted as a token arrival.
    token_arrival_times = token_arrival_times[:-1]

    # Strip the 6-character "data: " prefix before decoding the JSON payload.
    timings: dict = json.loads(last_valid_line[6:])["timings"]
    return (timings["prompt_ms"], token_arrival_times)
def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
    """Benchmark a llama.cpp server: start it, send all prompts concurrently, report statistics.

    The server is started via ``get_server`` and always terminated again, even if the
    benchmark fails. Statistics are written to the logger and two plots are saved to
    the current working directory as ``prompt_time.png`` and ``gen_rate.png``.

    Args:
        path_server: Path to the server binary.
        path_model: Path to the model; also used as the title of both plots.
        path_log: Log path forwarded to ``get_server`` (``None`` for no log).
        port: Port for the server.
        n_gpu_layers: Number of GPU layers for the server.
        parallel: Number of server slots; ``parallel + 1`` client threads are used.
        ctx_size: Context size per slot.
        n_prompts: Number of prompts to request from ``get_prompts``.
        n_predict: Maximum number of tokens to predict per prompt.
    """
    num_workers: int = parallel + 1
    prompts: list[str] = get_prompts(n_prompts)

    server: Optional[dict] = None
    session = None
    try:
        server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
        server_address: str = server["address"]

        # One pooled connection per worker thread so requests do not queue on the client side.
        adapter = requests.adapters.HTTPAdapter(pool_connections=num_workers, pool_maxsize=num_workers)  # type: ignore
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        data: list[dict] = []
        for i, p in enumerate(prompts):
            # The prompt index doubles as the sampling seed.
            data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})

        logger.info("Getting the prompt lengths...")
        prompt_n = [get_prompt_length(d) for d in data]

        logger.info("Starting the benchmark...\n")
        t0 = time()
        results: list[tuple[float, list[float]]] = thread_map(send_prompt, data, max_workers=num_workers, chunksize=1)
    finally:
        if server is not None:
            server["process"].terminate()
            server["process"].wait()
            # Close the server log if it is a real file (it may be subprocess.DEVNULL, an int).
            fout = server["fout"]
            if hasattr(fout, "close"):
                fout.close()
        if session is not None:
            session.close()

    # Number of requests actually sent. This can differ from the n_prompts argument,
    # e.g. when n_prompts is negative ("all prompts") or larger than the dataset.
    n_requests: int = len(results)

    prompt_ms = []
    token_t = []
    depth_sum: int = 0
    for pn, (pms, tat) in zip(prompt_n, results):
        prompt_ms.append(pms)
        token_t += tat
        n_tokens: int = len(tat)
        # Accumulate the context depth over all generated tokens of this request:
        # n_tokens times the prompt length plus 1 + 2 + ... + n_tokens.
        depth_sum += n_tokens * pn
        depth_sum += n_tokens * (n_tokens + 1) // 2
    prompt_n = np.array(prompt_n, dtype=np.int64)
    prompt_ms = np.array(prompt_ms, dtype=np.float64)
    token_t = np.array(token_t, dtype=np.float64)

    # Make arrival times relative to the benchmark start; the last arrival defines the duration.
    token_t -= t0
    token_t_last = np.max(token_t)

    logger.info("")
    logger.info(f"Benchmark duration:                {token_t_last:.2f} s")
    logger.info(f"Request throughput:                {n_requests / token_t_last:.2f} requests/s = {n_requests / (token_t_last/60):.2f} requests/min")
    logger.info(f"Total prompt length:               {np.sum(prompt_n)} tokens")
    logger.info(f"Average prompt length:             {np.mean(prompt_n):.2f} tokens")
    logger.info(f"Average prompt latency:            {np.mean(prompt_ms):.2f} ms")
    logger.info(f"Average prompt speed:              {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
    logger.info(f"Total generated tokens:            {token_t.shape[0]}")
    logger.info(f"Average generation depth:          {depth_sum / token_t.shape[0]:.2f} tokens")
    logger.info(f"Average total generation speed:    {token_t.shape[0] / token_t_last:.2f} tokens/s")
    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")

    # Scatter plot: prompt length vs. prompt processing time.
    plt.figure()
    plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
    plt.xlim(0, 1.05 * np.max(prompt_n))
    plt.ylim(0, 1.05 * np.max(prompt_ms))
    plt.title(path_model)
    plt.xlabel("Prompt length [tokens]")
    plt.ylabel("Time to first token [ms]")
    plt.savefig("prompt_time.png", dpi=240)

    # Histogram with one-second bins: tokens received per second over the run.
    bin_max = np.ceil(token_t_last) + 1
    plt.figure()
    plt.hist(token_t, np.arange(0, bin_max))
    plt.xlim(0, bin_max + 1)
    plt.title(path_model)
    plt.xlabel("Time [s]")
    plt.ylabel("Num. tokens generated per second")
    plt.savefig("gen_rate.png", dpi=240)
if __name__ == "__main__":
    # Command line entry point: every option maps 1:1 onto a keyword argument of benchmark().
    parser = argparse.ArgumentParser(
        description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
        "Results are printed to console and visualized as plots (saved to current working directory).")
    parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
    parser.add_argument("--path_model", type=str, required=True, help="Path to the model to use for the benchmark")
    # The help text used to be a copy of the --path_model help; describe the log option instead.
    parser.add_argument("--path_log", type=str, default=None, help="Path to the log file for the server output (output is discarded if not set)")
    parser.add_argument("--port", type=int, default=18725, help="Port to use for the server during the benchmark")
    parser.add_argument("--n_gpu_layers", type=int, default=999, help="Number of GPU layers for the server")
    parser.add_argument("--parallel", type=int, default=16, help="Number of slots for the server")
    parser.add_argument("--ctx_size", type=int, default=4096, help="Server context size per slot")
    parser.add_argument("--n_prompts", type=int, default=1000, help="Number of prompts to evaluate")
    parser.add_argument("--n_predict", type=int, default=2048, help="Max. number of tokens to predict per prompt")
    args = parser.parse_args()
    # The argparse destination names match benchmark()'s parameter names, so the namespace can be splatted.
    benchmark(**vars(args))

View file

@ -731,7 +731,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int32_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
@ -791,10 +792,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
auto * t_logits = res->get_logits();
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
// extract logits
if (logits && t_logits) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
}
// extract embeddings
if (t_embd) {
if (embd && t_embd) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);

View file

@ -11,6 +11,8 @@
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
// increase backlog size to avoid connection resets for >> 1 slots
#define CPPHTTPLIB_LISTEN_BACKLOG 512
// disable Nagle's algorithm
#define CPPHTTPLIB_TCP_NODELAY true
#include <cpp-httplib/httplib.h>