Merge branch 'upstream' into concedo_experimental

# Conflicts: # .flake8 # .github/labeler.yml # .github/workflows/bench.yml.disabled # .github/workflows/build-linux-cross.yml # .github/workflows/build.yml # .github/workflows/server.yml # .gitignore # CMakeLists.txt # CODEOWNERS # Makefile # README.md # SECURITY.md # build-xcframework.sh # ci/run.sh # docs/development/HOWTO-add-model.md # docs/multimodal/MobileVLM.md # docs/multimodal/glmedge.md # docs/multimodal/llava.md # docs/multimodal/minicpmo2.6.md # docs/multimodal/minicpmv2.5.md # docs/multimodal/minicpmv2.6.md # examples/CMakeLists.txt # examples/pydantic_models_to_grammar_examples.py # grammars/README.md # pyrightconfig.json # requirements/requirements-all.txt # scripts/fetch_server_test_models.py # scripts/tool_bench.py # scripts/xxd.cmake # tests/CMakeLists.txt # tests/run-json-schema-to-grammar.mjs # tools/batched-bench/CMakeLists.txt # tools/batched-bench/README.md # tools/batched-bench/batched-bench.cpp # tools/cvector-generator/CMakeLists.txt # tools/cvector-generator/README.md # tools/cvector-generator/completions.txt # tools/cvector-generator/cvector-generator.cpp # tools/cvector-generator/mean.hpp # tools/cvector-generator/negative.txt # tools/cvector-generator/pca.hpp # tools/cvector-generator/positive.txt # tools/export-lora/CMakeLists.txt # tools/export-lora/README.md # tools/export-lora/export-lora.cpp # tools/gguf-split/CMakeLists.txt # tools/gguf-split/README.md # tools/imatrix/CMakeLists.txt # tools/imatrix/README.md # tools/imatrix/imatrix.cpp # tools/llama-bench/CMakeLists.txt # tools/llama-bench/README.md # tools/llama-bench/llama-bench.cpp # tools/llava/CMakeLists.txt # tools/llava/README.md # tools/llava/android/adb_run.sh # tools/llava/android/build_64.sh # tools/llava/clip-quantize-cli.cpp # tools/main/CMakeLists.txt # tools/main/README.md # tools/perplexity/CMakeLists.txt # tools/perplexity/README.md # tools/perplexity/perplexity.cpp # tools/quantize/CMakeLists.txt # tools/rpc/CMakeLists.txt # tools/rpc/README.md # 
tools/rpc/rpc-server.cpp # tools/run/CMakeLists.txt # tools/run/README.md # tools/run/linenoise.cpp/linenoise.cpp # tools/run/linenoise.cpp/linenoise.h # tools/run/run.cpp # tools/server/CMakeLists.txt # tools/server/README.md # tools/server/bench/README.md # tools/server/public_simplechat/readme.md # tools/server/tests/README.md # tools/server/themes/README.md # tools/server/themes/buttons-top/README.md # tools/server/themes/wild/README.md # tools/tokenize/CMakeLists.txt # tools/tokenize/tokenize.cpp
2025-09-10 17:14:36 +00:00 · 2025-05-03 12:15:36 +08:00 · 2025-05-03 12:15:36 +08:00 · 5a2808ffaf
commit 5a2808ffaf
parent b258e23003 1d36b3670b
137 changed files with 75 additions and 303 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -21,15 +21,15 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
-[examples/server/public/*]
+[tools/server/public/*]
 indent_size = 2
-[examples/server/public/deps_*]
+[tools/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
-[examples/server/deps_*]
+[tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
@ -37,7 +37,7 @@ indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
-[examples/cvector-generator/*.txt]
+[tools/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
--- a/.gitignore
+++ b/.gitignore
@ -92,8 +92,6 @@ ppl-*.txt
 qnt-*.txt
 perf-*.txt
 examples/jeopardy/results.txt
 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -470,51 +470,51 @@ add_library(common2
            common/common.h
            common/sampling.cpp
            common/sampling.h
-            examples/llava/llava.cpp
+            tools/llava/llava.cpp
-            examples/llava/llava.h
+            tools/llava/llava.h
-            examples/llava/clip.cpp
+            tools/llava/clip.cpp
-            examples/llava/clip.h
+            tools/llava/clip.h
            src/unicode.h
            src/unicode.cpp
            src/unicode-data.cpp
            otherarch/utils.cpp
            otherarch/utils.h)
-target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(sdtype_adapter
            otherarch/sdcpp/sdtype_adapter.cpp)
-target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(whisper_adapter
            otherarch/whispercpp/whisper_adapter.cpp)
-target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
+target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./tools ./common)
 target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(tts_adapter
            otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
 target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(embeddings_adapter
            otherarch/embeddings_adapter.cpp)
-target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./examples ./common)
+target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
 target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(gpttype_adapter
            gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
@ -522,7 +522,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if (LLAMA_CUBLAS)
    set(TARGET koboldcpp_cublas)
    add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
    target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
    set_target_properties(${TARGET} PROPERTIES PREFIX "")
    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
@ -542,7 +542,7 @@ endif()
 if (LLAMA_HIPBLAS)
    set(TARGET koboldcpp_hipblas)
    add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
    target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
    set_target_properties(${TARGET} PROPERTIES PREFIX "")
    set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
--- a/Makefile
+++ b/Makefile
@ -511,7 +511,7 @@ sgemm_failsafe.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamaf
 #there's no intrinsics or special gpu ops used here, so we can have a universal object
 ggml-alloc.o: ggml/src/ggml-alloc.c ggml/include/ggml.h ggml/include/ggml-alloc.h
 	$(CC)  $(CFLAGS) -c $< -o $@
-llava.o: examples/llava/llava.cpp examples/llava/llava.h
+llava.o: tools/llava/llava.cpp tools/llava/llava.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 unicode.o: src/unicode.cpp src/unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@ -541,11 +541,11 @@ ggml-backend-reg_vulkan.o: ggml/src/ggml-backend-reg.cpp ggml/src/ggml-backend-i
 	$(CXX)  $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
 ggml-backend-reg_cublas.o: ggml/src/ggml-backend-reg.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/include/ggml-cpu.h
 	$(CXX)  $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
-llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
+llavaclip_default.o: tools/llava/clip.cpp tools/llava/clip.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
+llavaclip_cublas.o: tools/llava/clip.cpp tools/llava/clip.h
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
-llavaclip_vulkan.o: examples/llava/clip.cpp examples/llava/clip.h
+llavaclip_vulkan.o: tools/llava/clip.cpp tools/llava/clip.h
 	$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
 #this is only used for accelerate
@ -663,17 +663,17 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 # useful tools
-main: examples/main/main.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+main: tools/main/main.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-ttsmain: examples/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+ttsmain: tools/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+gguf-split: tools/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mtmd-cli: examples/llava/mtmd-cli.cpp examples/llava/mtmd.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+mtmd-cli: tools/llava/mtmd-cli.cpp tools/llava/mtmd.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ggml/src/ggml-vulkan-shaders.cpp:
@ -817,7 +817,7 @@ koboldcpp_vulkan_noavx2:
 endif
 # tools
-quantize_gguf: examples/quantize/quantize.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
+quantize_gguf: tools/quantize/quantize.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@ -827,7 +827,7 @@ quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
+quantize_clip: tools/llava/clip.cpp tools/llava/clip.h tools/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 #window simple clinfo
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -2212,14 +2212,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
        {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see examples/llava/README.md",
+        "path to a multimodal projector file. see tools/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.path = value;
        }
    ).set_examples(mmproj_examples));
    add_opt(common_arg(
        {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file. see examples/llava/README.md",
+        "URL to a multimodal projector file. see tools/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
--- a/common/common.h
+++ b/common/common.h
@ -336,7 +336,7 @@ struct common_params {
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/llava)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
@ -410,8 +410,8 @@ struct common_params {
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -1,265 +0,0 @@
 #include "ggml.h"
 #include "gguf.h"
 #include <cstdio>
 #include <string>
 #include <sstream>
 #include <vector>
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 // Formats `val` by streaming it with operator<< and returns the resulting text.
 template <typename T>
 static std::string to_string(const T & val) {
    std::ostringstream out;
    out << val;
    return out.str();
 }
 // Writes an example gguf file to `fname`.
 //
 // The file contains a fixed set of KV pairs (one per scalar type, a string and three
 // arrays) followed by n_tensors F32 tensors. Each tensor gets 1..GGML_MAX_DIMS dimensions
 // of extent 1..10 chosen with rand(), and every element of tensor i holds 100 + i.
 // Always returns true.
 static bool gguf_ex_write(const std::string & fname) {
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
    gguf_set_val_u64 (ctx, "some.parameter.uint64",   0x123456789abcdef0ull);
    gguf_set_val_i64 (ctx, "some.parameter.int64",   -0x123456789abcdef1ll);
    gguf_set_val_f64 (ctx, "some.parameter.float64",  0.1234567890123456789);
    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
    // scratch ggml context that owns the tensor data until the file has been written
    struct ggml_init_params params = {
        /*.mem_size   =*/ 128ull*1024ull*1024ull,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx_data = ggml_init(params);
    const int n_tensors = 10;
    // tensor infos
    for (int i = 0; i < n_tensors; ++i) {
        const std::string name = "tensor_" + to_string(i);
        // only ne[0] is initialised to 1 here; the first n_dims entries are overwritten below
        int64_t ne[GGML_MAX_DIMS] = { 1 };
        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
        for (int j = 0; j < n_dims; ++j) {
            ne[j] = rand() % 10 + 1;
        }
        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
        ggml_set_name(cur, name.c_str());
        {
            // fill with a per-tensor constant so a reader can verify the contents
            float * data = (float *) cur->data;
            for (int j = 0; j < ggml_nelements(cur); ++j) {
                data[j] = 100 + i;
            }
        }
        gguf_add_tensor(ctx, cur);
    }
    gguf_write_to_file(ctx, fname.c_str(), false);
    // fixed: the file name was followed by ';' instead of the closing quote
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    ggml_free(ctx_data);
    gguf_free(ctx);
    return true;
 }
 // just read tensor info
 static bool gguf_ex_read_0(const std::string & fname) {
    struct gguf_init_params params = {
        /*.no_alloc = */ false,
        /*.ctx      = */ NULL,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
    if (!ctx) {
        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
        return false;
    }
    printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);
        printf("%s: n_kv: %d\n", __func__, n_kv);
        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);
            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    // find kv string
    {
        const char * findkey = "some.parameter.string";
        const int keyidx = gguf_find_key(ctx, findkey);
        if (keyidx == -1) {
            printf("%s: find key: %s not found.\n", __func__, findkey);
        } else {
            const char * key_value = gguf_get_val_str(ctx, keyidx);
            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
        }
    }
    // tensor info
    {
        const int n_tensors = gguf_get_n_tensors(ctx);
        printf("%s: n_tensors: %d\n", __func__, n_tensors);
        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t size   = gguf_get_tensor_size  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
        }
    }
    gguf_free(ctx);
    return true;
 }
 // read and create ggml_context containing the tensors and their data
 static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
    struct ggml_context * ctx_data = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ false,
        /*.ctx      = */ &ctx_data,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
    printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);
        printf("%s: n_kv: %d\n", __func__, n_kv);
        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);
            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    // tensor info
    {
        const int n_tensors = gguf_get_n_tensors(ctx);
        printf("%s: n_tensors: %d\n", __func__, n_tensors);
        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t size   = gguf_get_tensor_size  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
        }
    }
    // data
    {
        const int n_tensors = gguf_get_n_tensors(ctx);
        for (int i = 0; i < n_tensors; ++i) {
            printf("%s: reading tensor %d data\n", __func__, i);
            const char * name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
            printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
            // print first 10 elements
            const float * data = (const float *) cur->data;
            printf("%s data[:10] : ", name);
            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
                printf("%f ", data[j]);
            }
            printf("\n\n");
            // check data
            if (check_data) {
                const float * data = (const float *) cur->data;
                for (int j = 0; j < ggml_nelements(cur); ++j) {
                    if (data[j] != 100 + i) {
                        fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
                        gguf_free(ctx);
                        return false;
                    }
                }
            }
        }
    }
    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
    ggml_free(ctx_data);
    gguf_free(ctx);
    return true;
 }
 // Entry point: `<prog> data.gguf r|w [n]`.
 // "w" writes the example file; "r" reads it back twice (metadata only, then with tensor data).
 // Prints the usage text and returns -1 when fewer than two arguments are given.
 int main(int argc, char ** argv) {
    if (argc < 3) {
        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
        printf("r: read data.gguf file\n");
        printf("w: write data.gguf file\n");
        printf("n: no check of tensor data\n");
        return -1;
    }
    // the tensor data check is skipped only when exactly one extra argument is present
    // (argc == 4); the value of that argument is not inspected
    const bool check_data = argc != 4;
    // fixed seed for the rand() calls made while writing
    srand(123456);
    const std::string fname(argv[1]);
    const std::string mode (argv[2]);
    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
    if (mode == "r") {
        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
        GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
    } else if (mode == "w") {
        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
    }
    return 0;
 }
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -40,8 +40,8 @@
 #include "neox_v2.cpp"
 #include "neox_v3.cpp"
 #include "mpt_v3.cpp"
-#include "examples/llava/clip.h"
+#include "tools/llava/clip.h"
-#include "examples/llava/llava.h"
+#include "tools/llava/llava.h"
 #include "common/common.h"
 //const
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@ -0,0 +1,39 @@
 # dependencies
 find_package(Threads REQUIRED)
 # third-party
 # ...
 # flags
 llama_add_compile_flags()
 # tools
 # nothing below is added when EMSCRIPTEN is set
 if (NOT EMSCRIPTEN)
    # always-built tools
    add_subdirectory(batched-bench)
    add_subdirectory(gguf-split)
    add_subdirectory(imatrix)
    add_subdirectory(llama-bench)
    add_subdirectory(main)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    # the server is opt-in through LLAMA_BUILD_SERVER
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
    add_subdirectory(run)
    add_subdirectory(tokenize)
    add_subdirectory(tts)
    # these tools use the backends directly and cannot be built with dynamic loading
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(cvector-generator)
        add_subdirectory(export-lora)
        add_subdirectory(llava)
        # the rpc tool is only added when GGML_RPC is set
        if (GGML_RPC)
            add_subdirectory(rpc)
        endif()
    endif()
 endif()
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
--- a/examples/llava/README-quantize.md
+++ b/examples/llava/README-quantize.md
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
--- a/examples/llava/deprecation-warning.cpp
+++ b/examples/llava/deprecation-warning.cpp
--- a/examples/llava/glmedge-convert-image-encoder-to-gguf.py
+++ b/examples/llava/glmedge-convert-image-encoder-to-gguf.py
--- a/examples/llava/glmedge-surgery.py
+++ b/examples/llava/glmedge-surgery.py
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
--- a/examples/llava/llava_surgery.py
+++ b/examples/llava/llava_surgery.py
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
--- a/examples/llava/minicpmv-surgery.py
+++ b/examples/llava/minicpmv-surgery.py
--- a/examples/llava/mtmd-cli.cpp
+++ b/examples/llava/mtmd-cli.cpp
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
--- a/examples/llava/qwen2vl-test.cpp
+++ b/examples/llava/qwen2vl-test.cpp
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
--- a/examples/llava/test-1.jpeg
+++ b/examples/llava/test-1.jpeg
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
--- a/examples/llava/quantclip.cpp
+++ b/examples/llava/quantclip.cpp
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
--- a/examples/server/bench/prometheus.yml
+++ b/examples/server/bench/prometheus.yml
--- a/examples/server/bench/requirements.txt
+++ b/examples/server/bench/requirements.txt
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
--- a/examples/server/chat-llama2.sh
+++ b/examples/server/chat-llama2.sh
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
--- a/examples/server/chat.sh
+++ b/examples/server/chat.sh
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/examples/server/public/loading.html
+++ b/examples/server/public/loading.html
--- a/examples/server/public_legacy/colorthemes.css
+++ b/examples/server/public_legacy/colorthemes.css
--- a/examples/server/public_legacy/completion.js
+++ b/examples/server/public_legacy/completion.js
--- a/examples/server/public_legacy/favicon.ico
+++ b/examples/server/public_legacy/favicon.ico
--- a/examples/server/public_legacy/index-new.html
+++ b/examples/server/public_legacy/index-new.html
--- a/examples/server/public_legacy/index.html
+++ b/examples/server/public_legacy/index.html
--- a/examples/server/public_legacy/index.js
+++ b/examples/server/public_legacy/index.js
--- a/examples/server/public_legacy/json-schema-to-grammar.mjs
+++ b/examples/server/public_legacy/json-schema-to-grammar.mjs
--- a/examples/server/public_legacy/loading.html
+++ b/examples/server/public_legacy/loading.html
--- a/examples/server/public_legacy/prompt-formats.js
+++ b/examples/server/public_legacy/prompt-formats.js
--- a/examples/server/public_legacy/style.css
+++ b/examples/server/public_legacy/style.css
--- a/examples/server/public_legacy/system-prompts.js
+++ b/examples/server/public_legacy/system-prompts.js
--- a/examples/server/public_legacy/theme-beeninorder.css
+++ b/examples/server/public_legacy/theme-beeninorder.css
--- a/examples/server/public_legacy/theme-ketivah.css
+++ b/examples/server/public_legacy/theme-ketivah.css
--- a/examples/server/public_legacy/theme-mangotango.css
+++ b/examples/server/public_legacy/theme-mangotango.css
--- a/examples/server/public_legacy/theme-playground.css
+++ b/examples/server/public_legacy/theme-playground.css
--- a/examples/server/public_legacy/theme-polarnight.css
+++ b/examples/server/public_legacy/theme-polarnight.css
--- a/examples/server/public_legacy/theme-snowstorm.css
+++ b/examples/server/public_legacy/theme-snowstorm.css
--- a/examples/server/public_simplechat/datautils.mjs
+++ b/examples/server/public_simplechat/datautils.mjs
--- a/examples/server/public_simplechat/index.html
+++ b/examples/server/public_simplechat/index.html
--- a/examples/server/public_simplechat/simplechat.css
+++ b/examples/server/public_simplechat/simplechat.css
--- a/examples/server/public_simplechat/simplechat.js
+++ b/examples/server/public_simplechat/simplechat.js
--- a/examples/server/public_simplechat/simplechat_screens.webp
+++ b/examples/server/public_simplechat/simplechat_screens.webp
--- a/examples/server/public_simplechat/ui.mjs
+++ b/examples/server/public_simplechat/ui.mjs
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/.gitignore
+++ b/examples/server/tests/.gitignore
--- a/examples/server/tests/conftest.py
+++ b/examples/server/tests/conftest.py
--- a/examples/server/tests/pytest.ini
+++ b/examples/server/tests/pytest.ini
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
--- a/examples/server/tests/unit/test_basic.py
+++ b/examples/server/tests/unit/test_basic.py
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
--- a/examples/server/tests/unit/test_ctx_shift.py
+++ b/examples/server/tests/unit/test_ctx_shift.py
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
--- a/examples/server/tests/unit/test_lora.py
+++ b/examples/server/tests/unit/test_lora.py
--- a/examples/server/tests/unit/test_rerank.py
+++ b/examples/server/tests/unit/test_rerank.py
--- a/examples/server/tests/unit/test_security.py
+++ b/examples/server/tests/unit/test_security.py
--- a/examples/server/tests/unit/test_slot_save.py
+++ b/examples/server/tests/unit/test_slot_save.py
--- a/examples/server/tests/unit/test_speculative.py
+++ b/examples/server/tests/unit/test_speculative.py
--- a/examples/server/tests/unit/test_tokenize.py
+++ b/examples/server/tests/unit/test_tokenize.py
--- a/examples/server/tests/unit/test_tool_call.py
+++ b/examples/server/tests/unit/test_tool_call.py
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
--- a/examples/server/themes/buttons-top/buttons_top.png
+++ b/examples/server/themes/buttons-top/buttons_top.png
--- a/examples/server/themes/buttons-top/favicon.ico
+++ b/examples/server/themes/buttons-top/favicon.ico
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
--- a/examples/server/themes/wild/favicon.ico
+++ b/examples/server/themes/wild/favicon.ico
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
--- a/examples/server/themes/wild/llama_cpp.png
+++ b/examples/server/themes/wild/llama_cpp.png
--- a/examples/server/themes/wild/llamapattern.png
+++ b/examples/server/themes/wild/llamapattern.png
--- a/examples/server/themes/wild/wild.png
+++ b/examples/server/themes/wild/wild.png
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
--- a/Show more
+++ b/Show more