From 484d2f31aed34ff9f096e3961125762e81d9b7d6 Mon Sep 17 00:00:00 2001 From: kallewoof Date: Wed, 11 Dec 2024 22:48:04 +0900 Subject: [PATCH 01/14] bug-fix: snprintf prints NULL in place of the last character (#10419) * bug-fix: snprintf prints NULL in place of the last character We need to give snprintf enough space to print the last character and the null character, thus we allocate one extra byte and then ignore it when converting to std::string. * add comment about extra null-term byte requirement --- examples/server/utils.hpp | 2 +- include/llama.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f545aea5..2fcb895ab 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) { if (res < 2) { return ""; } else { - std::vector model_template(res, 0); + std::vector model_template(res + 1, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1); } diff --git a/include/llama.h b/include/llama.h index 36945cde3..eebbacb80 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,7 @@ extern "C" { // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure // - The output string is always null-terminated and cleared on failure + // - When retrieving a string, an extra byte must be allocated to account for the null terminator // - GGUF array values are not supported by these functions // Get metadata value as a string by key name From 92f77a640f763c0af73554fb810a85a7d4c85e5e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 14:59:41 +0100 Subject: [PATCH 02/14] ci : pin nodejs to 22.11.0 (#10779) --- .github/workflows/server.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68c..671fe595c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html From 1a31d0dc00ba946d448e16ecc915ce5e8355994e Mon Sep 17 00:00:00 2001 From: qingy1337 Date: Wed, 11 Dec 2024 07:16:32 -0800 Subject: [PATCH 03/14] Update README.md (#10772) --- examples/quantize/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..f9cce7b21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) From 235f6e14bf0ed0211c51aeff14139038ae1000aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 20:52:14 +0100 Subject: [PATCH 04/14] server : (UI) add tok/s, get rid of completion.js (#10786) * get rid of completion.js * extract chat bubble to a component * add tok/s info * sync * fix BASE_URL * only extract timings when it's enabled * fix auto scroll --- examples/server/public/index.html | 198 ++++++++++++--------- examples/server/webui/index.html | 124 ++++++++----- examples/server/webui/package-lock.json | 7 + examples/server/webui/package.json | 1 + examples/server/webui/src/completion.js | 225 ------------------------ examples/server/webui/src/main.js | 128 +++++++++++--- 6 files changed, 307 insertions(+), 376 deletions(-) delete mode 100644 examples/server/webui/src/completion.js diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a44..9a19c5e83 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
-
+

Conversations

@@ -204,51 +204,25 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au( {{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
Advanced config
+
+ + Show tokens per second +
+ + + + +