From 80982e815e67bae2442237f4e11466f44c9a2988 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 12:14:13 +0200 Subject: [PATCH 01/10] arg : clean up handling --mmproj with -hf (#13082) * arg : clean up handling --mmproj with -hf * rm change about no_mmproj * Revert "rm change about no_mmproj" This reverts commit 2cac8e0efb629d66c612f137e75d562f94bb9e6c. * handle no_mmproj explicitly * skip download mmproj on examples not using it --- common/arg.cpp | 68 +++++++++++++++++++++++++++---------- common/common.h | 1 + examples/llava/mtmd-cli.cpp | 1 + 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1cfd0168d..85ba41114 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -38,6 +38,11 @@ using json = nlohmann::ordered_json; +std::initializer_list mmproj_examples = { + LLAMA_EXAMPLE_LLAVA, + // TODO: add LLAMA_EXAMPLE_SERVER when it's ready +}; + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; @@ -641,11 +646,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s // utils // -static void common_params_handle_model( +struct handle_model_result { + bool found_mmproj = false; + common_params_model mmproj; +}; + +static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default, - bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files? + const std::string & model_path_default) { + handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file { if (!model.hf_repo.empty()) { @@ -657,7 +667,12 @@ static void common_params_handle_model( exit(1); // built without CURL, error message already printed } model.hf_repo = auto_detected.repo; - model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile; + model.hf_file = auto_detected.ggufFile; + if (!auto_detected.mmprojFile.empty()) { + result.found_mmproj = true; + result.mmproj.hf_repo = model.hf_repo; + result.mmproj.hf_file = auto_detected.mmprojFile; + } } else { model.hf_file = model.path; } @@ -694,6 +709,8 @@ static void common_params_handle_model( exit(1); } } + + return result; } const std::vector kv_cache_types = { @@ -827,16 +844,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model(params.speculative.model, params.hf_token, ""); - common_params_handle_model(params.vocoder.model, params.hf_token, ""); - - // allow --mmproj to be set from -hf - // assuming that mmproj is always in the same repo as text model - if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { - params.mmproj.hf_repo = params.model.hf_repo; + // handle model and download + { + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified + params.mmproj = res.mmproj; + } + // only download mmproj if the current example is using it + for (auto & ex : mmproj_examples) { + if (ctx_arg.ex == ex) { + common_params_handle_model(params.mmproj, params.hf_token, ""); + break; + } + } + common_params_handle_model(params.speculative.model, params.hf_token, ""); + common_params_handle_model(params.vocoder.model, params.hf_token, ""); } - common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { string_process_escapes(params.prompt); @@ -2095,18 +2121,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + "path to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.path = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--mmproj-url"}, "URL", - "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", + "URL to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); + add_opt(common_arg( + {"--no-mmproj"}, + "explicitly disable multimodal projector, useful when using -hf", + [](common_params & params) { + params.no_mmproj = true; + } + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", @@ -2381,6 +2414,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", [](common_params & params, const std::string & value) { diff --git a/common/common.h b/common/common.h index e6eaa8e80..70d3ef8f2 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 89af7331a..193737605 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -261,6 +261,7 @@ int main(int argc, char ** argv) { if (params.mmproj.path.empty()) { show_additional_info(argc, argv); + LOG_ERR("ERR: Missing --mmproj argument\n"); return 1; } From 7c727fbe39150fbe8381f4fa43fed08719ebebe6 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 14:04:14 +0200 Subject: [PATCH 02/10] arg : add --no-mmproj-offload (#13093) * arg : add --no-mmproj-offload * Update common/arg.cpp --- common/arg.cpp | 7 +++++++ common/common.h | 1 + examples/llava/mtmd-cli.cpp | 7 ++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 85ba41114..9cbf98571 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2140,6 +2140,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.no_mmproj = true; } ).set_examples(mmproj_examples)); + add_opt(common_arg( + {"--no-mmproj-offload"}, + "do not offload multimodal projector to GPU", + [](common_params & params) { + params.mmproj_use_gpu = false; + } + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", diff --git a/common/common.h b/common/common.h index 70d3ef8f2..0a9dc0599 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 193737605..250e8c9a9 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) { "Usage: %s [options] -m --mmproj --image -p \n\n" " -m and --mmproj are required\n" " -hf user/repo can replace both -m and --mmproj in most cases\n" - " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", + " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n" + " to disable using GPU for mmproj model, add --no-mmproj-offload\n", argv[0] ); } @@ -112,10 +113,10 @@ struct mtmd_cli_context { void init_vision_context(common_params & params) { const char * clip_path = params.mmproj.path.c_str(); ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ - /* use_gpu */ true, + /* use_gpu */ params.mmproj_use_gpu, /* timings */ true, /* n_threads */ params.cpuparams.n_threads, - /* verbosity */ GGML_LOG_LEVEL_INFO, + /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, })); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); From 572b3141d343d7f947bf53b57513016e90db5680 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 15:44:05 +0300 Subject: [PATCH 03/10] clang-tidy : disable warning about missing math parenthesis (#13091) --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index 310c3d182..5bc63bc6e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -13,6 +13,7 @@ Checks: > -readability-magic-numbers, -readability-uppercase-literal-suffix, -readability-simplify-boolean-expr, + -readability-math-missing-parentheses, clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, From 13b4548877326fdabee3e831b8cfd65d9844383c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 16:00:10 +0300 Subject: [PATCH 04/10] cmake : do not include ./src as public for libllama (#13062) * cmake : do not include ./src as public for libllama ggml-ci * cmake : rework tests ggml-ci * llguidance : remove unicode include ggml-ci * cmake : make c++17 private ggml-ci --- common/arg.cpp | 2 - examples/CMakeLists.txt | 9 --- examples/gbnf-validator/CMakeLists.txt | 5 -- examples/quantize-stats/CMakeLists.txt | 6 -- grammars/README.md | 2 +- src/CMakeLists.txt | 5 +- tests/CMakeLists.txt | 69 +++++++++++-------- tests/test-chat.cpp | 5 +- .../test-gbnf-validator.cpp | 4 +- tests/test-grammar-integration.cpp | 5 +- tests/test-grammar-llguidance.cpp | 3 +- tests/test-grammar-parser.cpp | 4 +- tests/test-json-schema-to-grammar.cpp | 2 +- tests/test-llama-grammar.cpp | 3 +- .../test-quantize-stats.cpp | 3 +- tests/test-tokenizer-1-bpe.cpp | 3 +- tests/test-tokenizer-1-spm.cpp | 3 +- 17 files changed, 64 insertions(+), 69 deletions(-) delete mode 100644 examples/gbnf-validator/CMakeLists.txt delete mode 100644 examples/quantize-stats/CMakeLists.txt rename examples/gbnf-validator/gbnf-validator.cpp => tests/test-gbnf-validator.cpp (98%) rename examples/quantize-stats/quantize-stats.cpp => tests/test-quantize-stats.cpp (99%) diff --git a/common/arg.cpp b/common/arg.cpp index 9cbf98571..0657553e4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -994,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-embedding", "llama-eval-callback", "llama-export-lora", - "llama-gbnf-validator", "llama-gen-docs", "llama-gguf", "llama-gguf-hash", @@ -1014,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-perplexity", "llama-q8dot", "llama-quantize", - "llama-quantize-stats", "llama-qwen2vl-cli", "llama-retrieval", "llama-run", diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66cfab2c3..37476f904 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,11 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(gbnf-validator) - endif() - add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -58,10 +53,6 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(quantize-stats) - endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt deleted file mode 100644 index d2cb524c0..000000000 --- a/examples/gbnf-validator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gbnf-validator) -add_executable(${TARGET} gbnf-validator.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt deleted file mode 100644 index 9a3a0d3cd..000000000 --- a/examples/quantize-stats/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-quantize-stats) -add_executable(${TARGET} quantize-stats.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/grammars/README.md b/grammars/README.md index 935213f5c..5aa12acc1 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -112,7 +112,7 @@ You can use GBNF grammars: - In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field - In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags -- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings. +- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings. ## JSON Schemas → GBNF diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9f7ab13f1..1cd316b03 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,8 +32,9 @@ add_library(llama unicode.h ) -target_include_directories(llama PUBLIC . ../include) -target_compile_features (llama PUBLIC cxx_std_17) # don't bump +target_include_directories(llama PRIVATE .) +target_include_directories(llama PUBLIC ../include) +target_compile_features (llama PRIVATE cxx_std_17) # don't bump target_link_libraries(llama PUBLIC ggml) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2bb210702..ae6827525 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,17 @@ llama_add_compile_flags() +function(llama_build source) + if (DEFINED LLAMA_TEST_NAME) + set(TEST_TARGET ${LLAMA_TEST_NAME}) + else() + get_filename_component(TEST_TARGET ${source} NAME_WE) + endif() + + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE common) + install(TARGETS ${TEST_TARGET} RUNTIME) +endfunction() + function(llama_test target) include(CMakeParseArguments) set(options) @@ -36,7 +48,7 @@ endfunction() # - LABEL: label for the test (defaults to main) # - ARGS: arguments to pass to the test executable # - WORKING_DIRECTORY -function(llama_target_and_test source) +function(llama_build_and_test source) include(CMakeParseArguments) set(options) set(oneValueArgs NAME LABEL WORKING_DIRECTORY) @@ -58,6 +70,7 @@ function(llama_target_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE common) + add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} @@ -68,9 +81,7 @@ function(llama_target_and_test source) endfunction() # build test-tokenizer-0 target once and add many tests -add_executable(test-tokenizer-0 test-tokenizer-0.cpp) -target_link_libraries(test-tokenizer-0 PRIVATE common) -install(TARGETS test-tokenizer-0 RUNTIME) +llama_build(test-tokenizer-0.cpp) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) @@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) if (LLAMA_LLGUIDANCE) - llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) + llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) endif () if (NOT WIN32) # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API - llama_target_and_test(test-sampling.cpp) - llama_target_and_test(test-grammar-parser.cpp) - llama_target_and_test(test-grammar-integration.cpp) - llama_target_and_test(test-llama-grammar.cpp) - llama_target_and_test(test-chat.cpp) + llama_build_and_test(test-sampling.cpp) + llama_build_and_test(test-grammar-parser.cpp) + llama_build_and_test(test-grammar-integration.cpp) + llama_build_and_test(test-llama-grammar.cpp) + llama_build_and_test(test-chat.cpp) # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) + llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) endif() + llama_build(test-quantize-stats.cpp) + llama_build(test-gbnf-validator.cpp) # build test-tokenizer-1-bpe target once and add many tests - add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) - target_link_libraries(test-tokenizer-1-bpe PRIVATE common) - install(TARGETS test-tokenizer-1-bpe RUNTIME) + llama_build(test-tokenizer-1-bpe.cpp) # TODO: disabled due to slowness #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) @@ -120,37 +131,35 @@ if (NOT WIN32) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) # build test-tokenizer-1-spm target once and add many tests - add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp) - target_link_libraries(test-tokenizer-1-spm PRIVATE common) - install(TARGETS test-tokenizer-1-spm RUNTIME) + llama_build(test-tokenizer-1-spm.cpp) llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) - # llama_target_and_test(test-double-float.cpp) # SLOW + # llama_build_and_test(test-double-float.cpp) # SLOW endif() -llama_target_and_test(test-log.cpp) -llama_target_and_test(test-chat-template.cpp) +llama_build_and_test(test-log.cpp) +llama_build_and_test(test-chat-template.cpp) # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) - llama_target_and_test(test-arg-parser.cpp) + llama_build_and_test(test-arg-parser.cpp) endif() -# llama_target_and_test(test-opt.cpp) # SLOW -llama_target_and_test(test-gguf.cpp) -llama_target_and_test(test-backend-ops.cpp) +# llama_build_and_test(test-opt.cpp) # SLOW +llama_build_and_test(test-gguf.cpp) +llama_build_and_test(test-backend-ops.cpp) -llama_target_and_test(test-model-load-cancel.cpp LABEL "model") -llama_target_and_test(test-autorelease.cpp LABEL "model") +llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-autorelease.cpp LABEL "model") if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading - llama_target_and_test(test-barrier.cpp) - llama_target_and_test(test-quantize-fns.cpp) - llama_target_and_test(test-quantize-perf.cpp) - llama_target_and_test(test-rope.cpp) + llama_build_and_test(test-barrier.cpp) + llama_build_and_test(test-quantize-fns.cpp) + llama_build_and_test(test-quantize-perf.cpp) + llama_build_and_test(test-rope.cpp) endif() diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a0bf6affe..fa7aed82d 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -11,8 +11,9 @@ #include #include "chat.h" -#include "llama-grammar.h" -#include "unicode.h" + +#include "../src/unicode.h" +#include "../src/llama-grammar.h" using json = nlohmann::ordered_json; diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/tests/test-gbnf-validator.cpp similarity index 98% rename from examples/gbnf-validator/gbnf-validator.cpp rename to tests/test-gbnf-validator.cpp index a610e6a0b..6547eec32 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/tests/test-gbnf-validator.cpp @@ -1,5 +1,5 @@ -#include "unicode.h" -#include "llama-grammar.h" +#include "../src/unicode.h" +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 890608648..8988c347e 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -2,10 +2,11 @@ #undef NDEBUG #endif -#include "unicode.h" -#include "llama-grammar.h" #include "json-schema-to-grammar.h" +#include "../src/unicode.h" +#include "../src/llama-grammar.h" + #include #include #include diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp index 3c19220e1..566b039a0 100644 --- a/tests/test-grammar-llguidance.cpp +++ b/tests/test-grammar-llguidance.cpp @@ -2,7 +2,6 @@ # undef NDEBUG #endif -#include "unicode.h" #include "sampling.h" #include @@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str, fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following " - "command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf " + "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf " "test-grammar-integration.string.txt\n\n"); } else { fprintf(stdout, "✅︎\n"); diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 259172d99..67821a2d5 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -3,7 +3,9 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +// TODO: shold not include libllama sources +#include "../src/llama-grammar.h" #include diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 4d78e9142..e35134f3c 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -4,7 +4,7 @@ #include "json-schema-to-grammar.h" -#include "llama-grammar.h" +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index e2129206b..cc198f3e3 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -3,7 +3,8 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +#include "../src/llama-grammar.h" #include #include diff --git a/examples/quantize-stats/quantize-stats.cpp b/tests/test-quantize-stats.cpp similarity index 99% rename from examples/quantize-stats/quantize-stats.cpp rename to tests/test-quantize-stats.cpp index dd07ab9b3..db0105911 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -1,8 +1,9 @@ #include "ggml.h" #include "llama.h" -#include "llama-model.h" #include "common.h" +#include "../src/llama-model.h" + #include #include #include diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 55425d88a..b183da47f 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index 9e7b77f31..ba6e94ba8 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include From b10d8bfdb1dac40cce34e8860ca5ec7d950c3a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 24 Apr 2025 15:57:10 +0200 Subject: [PATCH 05/10] CUDA: use switch statements in constexpr functions (#13095) --- ggml/src/ggml-cuda/mmq.cuh | 80 ++++++++++++++++++++------------------ ggml/src/ggml-cuda/mmvq.cu | 80 ++++++++++++++++++++------------------ 2 files changed, 84 insertions(+), 76 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 532358018..3cb201552 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -155,25 +155,27 @@ static constexpr __device__ int get_mmq_y_device() { #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { - return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 : - type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 : - type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K : - type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K : - type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 : - tile_x_sizes{0, 0, 0}; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0; + case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1; + case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K; + case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K; + case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + default: return tile_x_sizes{0, 0, 0}; + } } #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) @@ -189,25 +191,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 : - 0; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + default: return 0; + } } #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index cac04916c..d846e35a6 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -7,47 +7,51 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : - type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : - type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : - type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : - type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : - type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : - type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : - type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : - type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : - type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : - type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : - type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : - type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : - type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : - type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : - type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : - type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : - type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : - type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 : - nullptr; + switch (type) { + case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1; + case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1; + case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1; + case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1; + case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1; + case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1; + case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1; + case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; + case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; + case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; + case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1; + case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1; + case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1; + case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1; + case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; + case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; + case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; + case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; + default: return nullptr; + } } static constexpr __device__ int get_vdr_mmvq(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ : - 1; + switch (type) { + case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; + case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; + case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ; + case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ; + case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ; + case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ; + case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ; + case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ; + case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ; + case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ; + case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; + case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; + default: return 1; + } } enum mmvq_parameter_table_id { From c6e8cc28c15166dba15629dba6a7366d4d5955ca Mon Sep 17 00:00:00 2001 From: Acly Date: Thu, 17 Apr 2025 14:16:45 +0200 Subject: [PATCH 06/10] ggml : Depthwise 2D convolution (ggml/1152) * ggml-cpu : kernels for faster depthwise 2D convolution * fix compile: remove static after moving to ops.cpp * add dilation for depthwise_conv_2d * review: rename to ggml_conv_2d_dw_direct, remove redundant struct keywords, pass by ref, whitespace * review: rename depthwise_conv_2d -> conv_2d_dw everywhere --- ggml/include/ggml.h | 22 ++++- ggml/src/ggml-cpu/ggml-cpu.c | 5 + ggml/src/ggml-cpu/ops.cpp | 172 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml.c | 53 ++++++++++- 5 files changed, 250 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8fcc16df9..51aa5b3a0 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -481,6 +481,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -677,6 +678,9 @@ extern "C" { GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1660,7 +1664,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // depthwise + // depthwise (via im2col and mul_mat) GGML_API struct ggml_tensor * ggml_conv_2d_dw( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1672,6 +1676,22 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 504003287..dbad8f61a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D_DW: + { + ggml_compute_forward_conv_2d_dw(params, tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor); @@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6050147be..3c2adb217 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d( } } +// ggml_compute_forward_conv_2d_dw + +struct ggml_conv_2d_dw_params { + int64_t channels; + int64_t batch; + int64_t src_w; + int64_t src_h; + int64_t dst_w; + int64_t dst_h; + int64_t knl_w; + int64_t knl_h; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int dilation_x; + int dilation_y; +}; + +static void ggml_compute_forward_conv_2d_dw_cwhn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t c = p.channels; + const float * knl_data = (const float *)kernel->data; + + const int64_t rows_total = p.dst_h * p.batch; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + +#ifdef GGML_SIMD + const int64_t pkg_size = GGML_F32_EPR; + const int64_t pkg_count = c / pkg_size; + const int64_t c_pkg_end = pkg_count * pkg_size; +#else + const int64_t c_pkg_end = 0; +#endif + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + +#ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } + } + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } +#endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } + } + dst_data[c_i] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_dw_whcn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t n = p.channels * p.batch; + const int64_t per_thread = (n + params->nth - 1) / params->nth; + const int64_t start = params->ith * per_thread; + const int64_t end = MIN(start + per_thread, n); + + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } +} + +void ggml_compute_forward_conv_2d_dw( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * src = dst->src[1]; + ggml_conv_2d_dw_params p; + p.channels = src->ne[2]; + p.batch = src->ne[3]; + p.src_w = src->ne[0]; + p.src_h = src->ne[1]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.knl_w = kernel->ne[0]; + p.knl_h = kernel->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(kernel->ne[3] == p.channels); + GGML_ASSERT(dst->ne[3] == p.batch); + + if (ggml_is_contiguous(src)) { + ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p); + } else if (ggml_is_contiguous_channels(src)) { + // kernel should also have channels most contiguous in memory + GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]); + ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p); + } else { + GGML_ABORT("non-contiguous memory layout not supported"); + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 410a37204..dc081b9e6 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 950772c75..c8b2feff4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", @@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; } +bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { + return + tensor->nb[0] > tensor->nb[2] && + tensor->nb[1] > tensor->nb[0] && + tensor->nb[2] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -4050,6 +4059,46 @@ struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_direct + +struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + GGML_ASSERT(a->ne[2] == 1); + GGML_ASSERT(a->ne[3] == b->ne[2]); + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1); + ne[2] = b->ne[2]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + if (ggml_is_contiguous_channels(b)) { + // Result will be permuted the same way as input (CWHN order) + const int64_t type_size = ggml_type_size(result->type); + GGML_ASSERT(ggml_blck_size(result->type) == 1); + result->nb[0] = result->ne[2] * type_size; + result->nb[1] = result->ne[0] * result->nb[0]; + result->nb[2] = type_size; + } + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D_DW; + result->src[0] = a; + result->src[1] = b; + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { From 63b4911494afe04778c61b9c19019341d71c99fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 16:47:43 +0300 Subject: [PATCH 07/10] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index cad082a90..41feffca9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f71d538ece3fb32a04824dc6d1e73e360be9d22f +13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f From 87616f0680947800ecba3e9f6bc6e101943bf8e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 17:22:27 +0300 Subject: [PATCH 08/10] ggml : fix trailing whitespaces (#0) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c8b2feff4..2a39dc7bf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4069,7 +4069,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( int stride1, int pad0, int pad1, - int dilation0, + int dilation0, int dilation1) { GGML_ASSERT(a->ne[2] == 1); GGML_ASSERT(a->ne[3] == b->ne[2]); From 226251ed56b85190e18a1cca963c45b888f4953c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 22:29:22 +0300 Subject: [PATCH 09/10] embeddings : fix batch sizes (#13076) ggml-ci --- examples/embedding/embedding.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 6f0890415..06fce236e 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -89,6 +89,13 @@ int main(int argc, char ** argv) { common_init(); params.embedding = true; + + // utilize the full context + if (params.n_batch < params.n_ctx) { + LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx); + params.n_batch = params.n_ctx; + } + // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; @@ -134,7 +141,6 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch >= params.n_ctx); // tokenize the prompts and trim std::vector> inputs; From 13be08daf992c89d5169518229b3740041c0f419 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 22:17:04 +0200 Subject: [PATCH 10/10] clip : remove boi/eoi embeddings for GLM-edge model (#13081) --- examples/llava/clip-impl.h | 2 -- examples/llava/clip.cpp | 17 +---------------- examples/llava/mtmd.cpp | 5 +++++ 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 8d310fb02..53ac38130 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -90,8 +90,6 @@ #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" -#define TN_GLM_BOI_W "adapter.boi" -#define TN_GLM_EOI_W "adapter.eoi" enum projector_type { PROJECTOR_TYPE_MLP, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 4eec4a264..9a5ab7c81 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -244,8 +244,6 @@ struct clip_vision_model { //GLMV-Edge projection struct ggml_tensor * mm_model_adapter_conv_w = nullptr; struct ggml_tensor * mm_model_adapter_conv_b = nullptr; - struct ggml_tensor * boi_w = nullptr; - struct ggml_tensor * eoi_w = nullptr; // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w = nullptr; @@ -1697,8 +1695,6 @@ struct clip_model_loader { vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(TN_GLM_EOI_W); } break; case PROJECTOR_TYPE_MERGER: { @@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) { } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { @@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } if (ctx->has_glm_projector) { GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi } // build the inference graph @@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - return true; } diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 11ca7b30f..a994ef016 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, marker_modified = "" + ctx->image_marker + ""; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|> + marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 marker_modified = "" + ctx->image_marker + "";