From b115105f05e3372bc75b2a486c1930c365fd2846 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 24 Jul 2024 11:25:19 +0200
Subject: [PATCH 1/5] add llama_lora_adapter_clear (#8653)

---
 include/llama.h | 6 +++++-
 src/llama.cpp   | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index e68cd807e..413070d95 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -529,12 +529,16 @@ extern "C" {
             struct llama_lora_adapter * adapter,
             float scale);
 
-    // Remove a LoRA adapter from given context
+    // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
+    // Remove all LoRA adapters from given context
+    LLAMA_API void llama_lora_adapter_clear(
+            struct llama_context * ctx);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
diff --git a/src/llama.cpp b/src/llama.cpp
index 40c5e8e8d..04eaf6730 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16201,6 +16201,10 @@ int32_t llama_lora_adapter_remove(
     return -1;
 }
 
+void llama_lora_adapter_clear(struct llama_context * ctx) {
+    ctx->lora_adapters.clear();
+}
+
 void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }

From 79167d9e49aef9caa98e13ee7ca067ec9f88b4b5 Mon Sep 17 00:00:00 2001
From: Joe Todd
Date: Wed, 24 Jul 2024 11:55:26 +0100
Subject: [PATCH 2/5] Re-add erroneously removed -fsycl from GGML_EXTRA_LIBS (#8667)

---
 ggml/src/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 12c440327..c6496c921 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -510,10 +510,10 @@ if (GGML_SYCL)
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
         if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} pthread m dl onemkl)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
         endif()
     endif()
 endif()

From 96952e7181929c6001b2bc69a33f240de731cc3a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 24 Jul 2024 13:48:46 +0200
Subject: [PATCH 3/5] llama : fix `llama_chat_format_single` for mistral (#8657)

* fix `llama_chat_format_single` for mistral

* fix typo

* use printf
---
 common/common.cpp            |  2 +-
 examples/main/main.cpp       |  1 +
 tests/test-chat-template.cpp | 30 ++++++++++++++++++++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4c19132f1..ec44a0552 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2723,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a0d817b1a..61e960ea2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
     llama_chat_msg new_msg{role, content};
     auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
+    LOG("formatted: %s\n", formatted.c_str());
     return formatted;
 }
 
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 6583dd0b2..46a7d3aea 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -133,13 +132,31 @@ int main(void) {
         );
         formatted_chat.resize(res);
         std::string output(formatted_chat.data(), formatted_chat.size());
-        printf("%s\n", output.c_str());
+        printf("%s\n", output.c_str());
+        printf("-------------------------\n");
         assert(output == expected);
     }
-    // test llama_chat_format_single
-    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
+
+    // test llama_chat_format_single for system message
+    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
     std::vector<llama_chat_msg> chat2;
+    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+
+    auto fmt_sys = [&](std::string tmpl) {
+        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n");
+        return output;
+    };
+    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
+    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+
+
+    // test llama_chat_format_single for user message
+    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
     llama_chat_msg new_msg{"user", "How are you"};
@@ -146,12 +163,13 @@ int main(void) {
 
     auto fmt_single = [&](std::string tmpl) {
         auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
-        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
+        printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n");
         return output;
     };
     assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
     assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-    assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 
     return 0;
From 3a7ac5300a7e8ebbe4a3eb5aff9dba11ed76ea61 Mon Sep 17 00:00:00 2001
From: Thorsten Sommer
Date: Wed, 24 Jul 2024 14:52:30 +0200
Subject: [PATCH 4/5] readme : update UI list [no ci] (#8505)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7c233b5e1..b7b9bf588 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
 
 Unless otherwise noted these projects are open-source with permissive licensing:
 
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)

From f19bf99c015d3d745143e8bb4f056e0ea015ad40 Mon Sep 17 00:00:00 2001
From: Joe Todd
Date: Wed, 24 Jul 2024 14:36:00 +0100
Subject: [PATCH 5/5] Build Llama SYCL Intel with static libs (#8668)

Ensure SYCL CI builds both static & dynamic libs for testing purposes

Signed-off-by: Joe Todd
---
 .devops/llama-cli-intel.Dockerfile    | 4 +++-
 .devops/llama-server-intel.Dockerfile | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 2bf82bb58..79dba06a7 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index eb9aba618..f525658dd 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
+    echo "Building with dynamic libs" && \
     cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
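Returning to the API added in PATCH 1/5: llama_lora_adapter_clear detaches every adapter from a context in one call, while llama_lora_adapter_remove detaches a single adapter and returns -1 if it was not attached. The sketch below is a hypothetical usage example, not code from the patches; it assumes a model and context already exist, assumes the loader llama_lora_adapter_init from the same adapter API family, and uses a placeholder file name.

    #include "llama.h"

    // Hypothetical usage sketch for the LoRA adapter calls touched by PATCH 1/5.
    // Assumes `model` and `ctx` were created elsewhere; "lora-adapter.gguf" is a placeholder.
    static void demo_lora_clear(struct llama_model * model, struct llama_context * ctx) {
        // Load an adapter from disk and attach it to the context with a scale of 1.0.
        struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "lora-adapter.gguf");
        if (adapter == nullptr) {
            return; // failed to load the adapter file
        }
        llama_lora_adapter_set(ctx, adapter, 1.0f);

        // Option 1: detach this specific adapter (returns -1 if it was not attached).
        // llama_lora_adapter_remove(ctx, adapter);

        // Option 2 (new in this patch series): detach all attached adapters at once.
        llama_lora_adapter_clear(ctx);

        // The adapter object itself stays loaded; free it explicitly,
        // or let it be freed together with the model.
        llama_lora_adapter_free(adapter);
    }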