diff --git a/common/chat.h b/common/chat.h index 8ace3e6ba..b29c627e6 100644 --- a/common/chat.h +++ b/common/chat.h @@ -219,6 +219,7 @@ struct common_chat_parser_params { bool reasoning_in_content = false; std::string generation_prompt; bool parse_tool_calls = true; + bool is_continuation = false; bool echo = false; // Include assistant prefilled msg in output bool debug = false; // Enable debug output for PEG parser common_peg_arena parser = {}; diff --git a/common/common.h b/common/common.h index 1ffc4095d..2fb986f15 100644 --- a/common/common.h +++ b/common/common.h @@ -618,11 +618,7 @@ struct common_params { std::map default_template_kwargs; // UI configs -#ifdef LLAMA_UI_DEFAULT_ENABLED - bool ui = LLAMA_UI_DEFAULT_ENABLED != 0; -#else - bool ui = true; // default to enabled when not set -#endif + bool ui = true; // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead bool webui = ui; diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9b506e26c..c865b07e1 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ GGML_ASSERT(tensor); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) { + if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) { for (size_t i = 0; i < n_copies; i++) { ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size); } @@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ } GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data); } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 11254f500..be63c2701 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1465,6 +1465,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // input scales if (!layer.wq_in_s && layer.wq) { @@ -1524,6 +1530,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_in_s && layer.ssm_beta) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } } // output scales if (output && output->type == GGML_TYPE_NVFP4) { diff --git a/src/llama-model.h b/src/llama-model.h index 01c87a752..398a0aa72 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -202,12 +202,16 @@ struct llama_layer_shortconv { }; struct llama_layer_nextn { - struct ggml_tensor * eh_proj = nullptr; - struct ggml_tensor * embed_tokens = nullptr; - struct ggml_tensor * enorm = nullptr; - struct ggml_tensor * hnorm = nullptr; - struct ggml_tensor * shared_head_head = nullptr; - struct ggml_tensor * shared_head_norm = nullptr; + struct ggml_tensor * eh_proj = nullptr; + struct ggml_tensor * eh_proj_s = nullptr; + struct ggml_tensor * eh_proj_in_s = nullptr; + struct ggml_tensor * embed_tokens = nullptr; + struct ggml_tensor * enorm = nullptr; + struct ggml_tensor * hnorm = nullptr; + struct ggml_tensor * shared_head_head = nullptr; + struct ggml_tensor * shared_head_head_s = nullptr; + struct ggml_tensor * shared_head_head_in_s = nullptr; + struct ggml_tensor * shared_head_norm = nullptr; }; struct llama_layer { diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index adeb0c26e..04ecc18fc 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -538,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -626,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index e4512116d..dc24f6ed5 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -602,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -722,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt index 0a1f4e864..f26d8e912 100644 --- a/tools/mtmd/requirements.txt +++ b/tools/mtmd/requirements.txt @@ -1,5 +1,12 @@ -r ../../requirements/requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu pillow~=11.3.0 -torch~=2.6.0 -torchvision~=0.21.0 + +## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility +torch==2.11.0; platform_machine != "s390x" # check_requirements: ignore "==" +torchvision==0.26.0; platform_machine != "s390x" # check_requirements: ignore "==" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "==" +torchvision>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "==" diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 9d008fc94..9c025952d 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -231,16 +231,19 @@ bool server_http_context::init(const common_params & params) { }; auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - (void)req; // suppress unused parameter warning when LLAMA_BUILD_UI is not defined bool ready = is_ready.load(); if (!ready) { -#if defined(LLAMA_BUILD_UI) +#if defined(LLAMA_UI_HAS_ASSETS) auto tmp = string_split(req.path, '.'); if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) { - res.status = 503; - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - return false; + if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) { + res.status = 503; + res.set_content(reinterpret_cast(a->data), a->size, "text/html; charset=utf-8"); + return false; + } } +#else + (void)req; #endif // no endpoints are allowed to be accessed when the server is not ready // this is to prevent any data races or inconsistent states @@ -312,23 +315,27 @@ bool server_http_context::init(const common_params & params) { return 1; } } else { -#if defined(LLAMA_BUILD_UI) - // using embedded static index.html - srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) { - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); - return false; - }); - srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) { - res.set_content(reinterpret_cast(bundle_js), bundle_js_len, "application/javascript; charset=utf-8"); - return false; - }); - srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) { - res.set_content(reinterpret_cast(bundle_css), bundle_css_len, "text/css; charset=utf-8"); - return false; - }); +#if defined(LLAMA_UI_HAS_ASSETS) + auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) { + return [name, mime, with_isolation_headers](const httplib::Request & /*req*/, httplib::Response & res) { + const llama_ui_asset * a = llama_ui_find_asset(name.c_str()); + if (!a) { + res.status = 404; + return false; + } + if (with_isolation_headers) { + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + } + res.set_content(reinterpret_cast(a->data), a->size, mime); + return false; + }; + }; + + srv->Get(params.api_prefix + "/", serve_asset("index.html", "text/html; charset=utf-8", true)); + srv->Get(params.api_prefix + "/bundle.js", serve_asset("bundle.js", "application/javascript; charset=utf-8", false)); + srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8", false)); #endif } } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index d45513dbe..abc00c82b 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -149,7 +149,7 @@ task_result_state::task_result_state(const common_chat_parser_params & chat_pars , oai_resp_id("resp_" + random_string()) , oai_resp_reasoning_id("rs_" + random_string()) , oai_resp_message_id("msg_" + random_string()) { - if (!chat_parser_params.echo) { + if (chat_parser_params.is_continuation && !chat_parser_params.echo) { // initialize chat_msg to avoid emitting a delta containing the assistant prefill chat_msg = common_chat_parse("", true, chat_parser_params); } @@ -432,6 +432,10 @@ task_params server_task::params_from_json_cmpl( if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); } + if (data.contains("continue_final_message")) { + auto continuation = common_chat_continuation_parse(data.at("continue_final_message")); + params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE; + } params.chat_parser_params.echo = json_value(data, "echo", false); } diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt index 383940cb6..d4cf35802 100644 --- a/tools/ui/CMakeLists.txt +++ b/tools/ui/CMakeLists.txt @@ -1,150 +1,98 @@ set(TARGET llama-ui) -# Deprecated: use LLAMA_UI_HF_BUCKET instead -set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets (deprecated: use LLAMA_UI_HF_BUCKET)") set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") # Backward compat: forward old var to new one -if(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT DEFINED LLAMA_UI_HF_BUCKET) +if(DEFINED LLAMA_BUILD_WEBUI) + set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI}) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") +endif() +if(DEFINED LLAMA_USE_PREBUILT_WEBUI) + set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI}) + message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead") +endif() +if(DEFINED LLAMA_WEBUI_HF_BUCKET) set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET}) -elseif(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT "${LLAMA_WEBUI_HF_BUCKET}" STREQUAL "${LLAMA_UI_HF_BUCKET}") message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead") endif() -set(TARGET_SRCS "") -set(UI_COMPILE_DEFS "") - -if(LLAMA_BUILD_UI) - set(PUBLIC_ASSETS - index.html - bundle.js - bundle.css - loading.html - ) - - # Determine source of UI assets (priority: local > HF Bucket) - set(UI_SOURCE "") - set(UI_SOURCE_DIR "") - - # Priority 1: Check for local build output - set(LOCAL_UI_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist") - - # Verify all required assets exist before declaring local source valid - set(ALL_ASSETS_PRESENT TRUE) - foreach(asset ${PUBLIC_ASSETS}) - if(NOT EXISTS "${LOCAL_UI_DIR}/${asset}") - set(ALL_ASSETS_PRESENT FALSE) - break() - endif() - endforeach() - - if(ALL_ASSETS_PRESENT) - set(UI_SOURCE "local") - set(UI_SOURCE_DIR "${LOCAL_UI_DIR}") - message(STATUS "UI: using local build from ${UI_SOURCE_DIR}") - endif() - - # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback) - if(NOT UI_SOURCE_DIR) - # Environment variable takes precedence (e.g., from CI workflows) - # Deprecated: use HF_UI_VERSION instead - if(DEFINED ENV{HF_WEBUI_VERSION}) - set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}") - message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead") - if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") - message(FATAL_ERROR "UI: invalid HF_WEBUI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") - endif() - elseif(DEFINED ENV{HF_UI_VERSION}) - set(HF_UI_VERSION "$ENV{HF_UI_VERSION}") - if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") - message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") - endif() - elseif(DEFINED LLAMA_BUILD_NUMBER) - set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}") - message(STATUS "UI: derived HF_UI_VERSION=b${LLAMA_BUILD_NUMBER}") - else() - set(HF_UI_VERSION "") - message(STATUS "UI: version not specified (will use HF 'latest')") - endif() - - if("${HF_UI_VERSION}" STREQUAL "") - set(UI_VERSION_TAG "provisioned") - else() - set(UI_VERSION_TAG "${HF_UI_VERSION}") - endif() - set(UI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.ui-${UI_VERSION_TAG}.stamp") - - string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}") - - add_custom_command( - OUTPUT ${UI_STAMP} - COMMAND ${CMAKE_COMMAND} - "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}" - "-DPUBLIC_DIR=${PROJECT_SOURCE_DIR}/build/tools/ui/dist" - "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" - "-DHF_VERSION=${HF_UI_VERSION}" - "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" - "-DASSETS=${PUBLIC_ASSETS_JOINED}" - "-DSTAMP_FILE=${UI_STAMP}" - "-DNPM_DIR=${PROJECT_SOURCE_DIR}/tools/ui" - -P ${PROJECT_SOURCE_DIR}/scripts/ui-download.cmake - COMMENT "Building/provisioning UI assets (npm build -> HF Bucket fallback)" - ) - - set(UI_SOURCE "provisioned") - set(UI_SOURCE_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist") - endif() - - # Process assets from the determined source - if(UI_SOURCE_DIR) - foreach(asset ${PUBLIC_ASSETS}) - set(input "${UI_SOURCE_DIR}/${asset}") - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - list(APPEND TARGET_SRCS ${output}) - - if(UI_SOURCE STREQUAL "local") - if(NOT EXISTS "${input}") - message(FATAL_ERROR "UI asset not found: ${input}") - endif() - set(dependency "${input}") - else() - set(dependency "${UI_STAMP}") - endif() - - add_custom_command( - DEPENDS ${dependency} - OUTPUT "${output}" - COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" - ) - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) - endforeach() - - list(APPEND UI_COMPILE_DEFS - LLAMA_BUILD_UI - LLAMA_UI_DEFAULT_ENABLED=1 - ) - message(STATUS "UI: embedded with source: ${UI_SOURCE}") - else() - message(WARNING "UI: no source available. Neither local build (build/tools/ui/dist/) nor HF Bucket download succeeded.") - message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.") - list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0) - endif() +# Resolve HF asset version: explicit env var > derived from build number > unset +if(DEFINED ENV{HF_WEBUI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}") + message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead") +elseif(DEFINED ENV{HF_UI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_UI_VERSION}") +elseif(DEFINED LLAMA_BUILD_NUMBER) + set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}") else() - list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0) + set(HF_UI_VERSION "") endif() -# Build the static library -add_library(${TARGET} STATIC ui.cpp) +if(NOT "${HF_UI_VERSION}" STREQUAL "" AND NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") +endif() -target_include_directories(${TARGET} PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR} +set(UI_CPP "${CMAKE_CURRENT_BINARY_DIR}/ui.cpp") +set(UI_H "${CMAKE_CURRENT_BINARY_DIR}/ui.h") + +if(CMAKE_CROSSCOMPILING) + find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + if(NOT HOST_CXX_COMPILER) + message(FATAL_ERROR "UI: no host C++ compiler (g++/clang++) found to build llama-ui-embed; set -DHOST_CXX_COMPILER=") + endif() + message(STATUS "UI: building llama-ui-embed with host compiler ${HOST_CXX_COMPILER}") + + if(CMAKE_HOST_WIN32) + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed.exe") + else() + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed") + endif() + + add_custom_command( + OUTPUT "${LLAMA_UI_EMBED_EXE}" + COMMAND "${HOST_CXX_COMPILER}" -O2 -std=c++17 + -o "${LLAMA_UI_EMBED_EXE}" "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp" + COMMENT "Building llama-ui-embed (host)" + VERBATIM + ) + add_custom_target(llama-ui-embed DEPENDS "${LLAMA_UI_EMBED_EXE}") +else() + add_executable(llama-ui-embed embed.cpp) + target_compile_features(llama-ui-embed PRIVATE cxx_std_17) + set_target_properties(llama-ui-embed PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) + set(LLAMA_UI_EMBED_EXE "$") +endif() + +# Run the provisioning script every build so source changes in tools/ui/ are +# always picked up. The script uses copy_if_different for ui.cpp/ui.h, so the +# library only recompiles when contents actually change. +add_custom_target(llama-ui-assets ALL + BYPRODUCTS ${UI_CPP} ${UI_H} + COMMAND ${CMAKE_COMMAND} + "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}" + "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}" + "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" + "-DHF_VERSION=${HF_UI_VERSION}" + "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" + "-DBUILD_UI=${LLAMA_BUILD_UI}" + "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}" + -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake" + COMMENT "Provisioning UI assets" + VERBATIM ) -target_compile_definitions(${TARGET} PUBLIC ${UI_COMPILE_DEFS}) +add_dependencies(llama-ui-assets llama-ui-embed) -if(TARGET_SRCS) - # List generated .hpp files as sources so CMake tracks them as build dependencies - target_sources(${TARGET} PRIVATE ${TARGET_SRCS}) - set_source_files_properties(${TARGET_SRCS} PROPERTIES HEADER_FILE_ONLY TRUE) -endif() +set_source_files_properties(${UI_CPP} ${UI_H} PROPERTIES GENERATED TRUE) + +add_library(${TARGET} STATIC ${UI_CPP} ${UI_H}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) +add_dependencies(${TARGET} llama-ui-assets) + +target_include_directories(${TARGET} PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/tools/ui/embed.cpp b/tools/ui/embed.cpp new file mode 100644 index 000000000..41227868e --- /dev/null +++ b/tools/ui/embed.cpp @@ -0,0 +1,144 @@ +// llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays. +// +// Usage: +// llama-ui-embed [ ]... + +#include +#include +#include +#include +#include +#include + +static bool read_file(const std::string & path, std::vector & out) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + fprintf(stderr, "embed: cannot open %s\n", path.c_str()); + return false; + } + const auto sz = f.tellg(); + if (sz < 0) { + return false; + } + f.seekg(0); + out.resize(static_cast(sz)); + if (sz > 0 && !f.read(reinterpret_cast(out.data()), sz)) { + return false; + } + return true; +} + +static void append_bytes_hex(std::string & out, const std::vector & bytes) { + static const char hex[] = "0123456789abcdef"; + out.reserve(out.size() + bytes.size() * 5); + for (unsigned char b : bytes) { + out += '0'; + out += 'x'; + out += hex[b >> 4]; + out += hex[b & 0xf]; + out += ','; + } +} + +static bool write_if_different(const std::string & path, const std::string & content) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (f) { + const auto sz = f.tellg(); + if (sz >= 0 && static_cast(sz) == content.size()) { + std::string existing(static_cast(sz), '\0'); + f.seekg(0); + if (sz == 0 || f.read(existing.data(), sz)) { + if (existing == content) { + return true; + } + } + } + } + + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out) { + fprintf(stderr, "embed: cannot write %s\n", path.c_str()); + return false; + } + if (!content.empty()) { + out.write(content.data(), static_cast(content.size())); + } + return out.good(); +} + +static std::string fmt(const char * pattern, ...) { + char tmp[512]; + va_list ap; + va_start(ap, pattern); + const int n = vsnprintf(tmp, sizeof(tmp), pattern, ap); + va_end(ap); + return (n > 0) ? std::string(tmp, static_cast(n)) : std::string(); +} + +int main(int argc, char ** argv) { + if (argc < 3 || ((argc - 3) % 2) != 0) { + fprintf(stderr, "usage: %s [ ]...\n", argv[0]); + return 1; + } + + const std::string out_cpp = argv[1]; + const std::string out_h = argv[2]; + const int n_assets = (argc - 3) / 2; + + std::string h; + h += "#pragma once\n\n#include \n\n"; + if (n_assets > 0) { + h += "#define LLAMA_UI_HAS_ASSETS 1\n\n"; + } + h += + "struct llama_ui_asset {\n" + " const char * name;\n" + " const unsigned char * data;\n" + " size_t size;\n" + "};\n\n" + "const llama_ui_asset * llama_ui_find_asset(const char * name);\n"; + + std::string cpp; + cpp += "#include \"ui.h\"\n\n#include \n\n"; + + if (n_assets > 0) { + for (int i = 0; i < n_assets; i++) { + const char * path = argv[3 + i * 2 + 1]; + std::vector bytes; + if (!read_file(path, bytes)) { + return 1; + } + cpp += fmt("static const unsigned char asset_%d_data[] = {", i); + append_bytes_hex(cpp, bytes); + cpp += fmt("};\nstatic const size_t asset_%d_size = %lu;\n\n", + i, static_cast(bytes.size())); + } + + cpp += "static const llama_ui_asset g_assets[] = {\n"; + for (int i = 0; i < n_assets; i++) { + const char * name = argv[3 + i * 2]; + cpp += fmt(" { \"%s\", asset_%d_data, asset_%d_size },\n", name, i, i); + } + cpp += "};\n\n"; + + cpp += + "const llama_ui_asset * llama_ui_find_asset(const char * name) {\n" + " for (const auto & a : g_assets) {\n" + " if (strcmp(a.name, name) == 0) {\n" + " return &a;\n" + " }\n" + " }\n" + " return nullptr;\n" + "}\n"; + } else { + cpp += + "const llama_ui_asset * llama_ui_find_asset(const char *) {\n" + " return nullptr;\n" + "}\n"; + } + + bool ok = true; + ok = write_if_different(out_h, h) && ok; + ok = write_if_different(out_cpp, cpp) && ok; + return ok ? 0 : 1; +} diff --git a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts index ddf6fa1e5..01c714a24 100644 --- a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts +++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts @@ -19,7 +19,7 @@ const GUIDE_FOR_FRONTEND = ` --> `.trim(); -const OUTPUT_DIR = '../../build/tools/ui/dist'; +const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist'; export function llamaCppBuildPlugin(): Plugin { return { diff --git a/tools/ui/sources.cmake b/tools/ui/sources.cmake new file mode 100644 index 000000000..de9dbf78b --- /dev/null +++ b/tools/ui/sources.cmake @@ -0,0 +1,15 @@ +# Inputs used to decide whether the npm build output is up-to-date. + +set(UI_SOURCE_GLOBS + src/* + static/* +) + +set(UI_SOURCE_FILES + package.json + package-lock.json + vite.config.ts + svelte.config.js + tsconfig.json + scripts/vite-plugin-llama-cpp-build.ts +) diff --git a/tools/ui/svelte.config.js b/tools/ui/svelte.config.js index 6490e91da..b2d3f14dc 100644 --- a/tools/ui/svelte.config.js +++ b/tools/ui/svelte.config.js @@ -2,6 +2,10 @@ import { mdsvex } from 'mdsvex'; import adapter from '@sveltejs/adapter-static'; import { vitePreprocess } from '@sveltejs/vite-plugin-svelte'; +// CMake sets LLAMA_UI_OUT_DIR to the staging dir under the build tree; manual +// `npm run build` runs without the env var default to ./dist. +const outDir = process.env.LLAMA_UI_OUT_DIR ?? './dist'; + /** @type {import('@sveltejs/kit').Config} */ const config = { // Consult https://svelte.dev/docs/kit/integrations @@ -15,8 +19,8 @@ const config = { }, router: { type: 'hash' }, adapter: adapter({ - pages: '../../build/tools/ui/dist', - assets: '../../build/tools/ui/dist', + pages: outDir, + assets: outDir, fallback: 'index.html', precompress: false, strict: true diff --git a/tools/ui/ui.cpp b/tools/ui/ui.cpp deleted file mode 100644 index d02a62c2c..000000000 --- a/tools/ui/ui.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#ifdef LLAMA_BUILD_UI -// auto generated files (see README.md for details) -#include "index.html.hpp" -#include "bundle.js.hpp" -#include "bundle.css.hpp" -#include "loading.html.hpp" -#endif diff --git a/tools/ui/ui.h b/tools/ui/ui.h deleted file mode 100644 index 6f775ea3a..000000000 --- a/tools/ui/ui.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -// TODO @ngxson : refactor, wrap these in a function - -#ifdef LLAMA_BUILD_UI -extern unsigned char index_html[]; -extern unsigned int index_html_len; - -extern unsigned char bundle_js[]; -extern unsigned int bundle_js_len; - -extern unsigned char bundle_css[]; -extern unsigned int bundle_css_len; - -extern unsigned char loading_html[]; -extern unsigned int loading_html_len; -#endif