Merge commit '280345968d' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/main-cuda.Dockerfile
#	.devops/nix/package.nix
#	.devops/server-cuda.Dockerfile
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.lock
#	llama.cpp
#	scripts/LlamaConfig.cmake.in
#	scripts/compare-commits.sh
#	scripts/server-llm.sh
#	tests/test-quantize-fns.cpp
Commit a530afa1e4 by Concedo, 2024-04-07 20:27:17 +08:00
33 changed files with 124 additions and 1280 deletions


@@ -80,7 +80,7 @@ if (LLAMA_CUBLAS)
 enable_language(CUDA)
-add_compile_definitions(GGML_USE_CUBLAS)
+add_compile_definitions(GGML_USE_CUDA)
 add_compile_definitions(SD_USE_CUBLAS)
 add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
@@ -152,7 +152,7 @@ if (LLAMA_HIPBLAS)
 message(STATUS "HIP and hipBLAS found")
 file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
 list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS SD_USE_CUBLAS)
+add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
 add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
 if (LLAMA_CUDA_FORCE_DMMV)
 target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)


@@ -55,7 +55,7 @@ CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
 else
 CUBLAS_FLAGS =
 endif
@@ -142,7 +142,7 @@ endif
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 NVCC = nvcc
@@ -226,7 +226,7 @@ ifdef LLAMA_HIPBLAS
 LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 1
 LLAMA_CUDA_KQUANTS_ITER ?= 2
-HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
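
Note that this fork's Makefile (and its CMakeLists.txt above) still keys the CUDA build off the LLAMA_CUBLAS switch; only the preprocessor define it emits changes from GGML_USE_CUBLAS to GGML_USE_CUDA, while the upstream docs further down now say `LLAMA_CUDA=1`. As a rough sketch, a CUDA build of this tree would therefore still be started like this:

```bash
# The build switch is unchanged here; it now passes -DGGML_USE_CUDA to the compiler.
make LLAMA_CUBLAS=1 -j
```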


@@ -49,12 +49,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
 #endif
 #if defined(LLAMA_USE_CURL)
@@ -862,9 +862,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 return true;
 }
 params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--split-mode" || arg == "-sm") {
@@ -890,9 +890,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 invalid_param = true;
 return true;
 }
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--tensor-split" || arg == "-ts") {
@@ -918,9 +918,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.tensor_split[i] = 0.0f;
 }
 }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
 return true;
 }
 if (arg == "--no-mmap") {
@@ -2388,7 +2388,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
 fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
 fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
 fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");


@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99


@@ -114,7 +114,7 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
 std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 int count = ggml_backend_cuda_get_device_count();
 for (int i = 0; i < count; i++) {
 char buf[128];
@@ -809,7 +809,7 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();


@@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 ### run on Orin


@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 }
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 new_clip->backend = ggml_backend_cuda_init(0);
 printf("%s: CLIP using CUDA backend\n", __func__);
 #endif


@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
 ### Considerations
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
 ### Build llama.cpp and install to C:\LlamaCPP directory
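
For reference, the configure step that the Considerations paragraph above describes would pass both package locations through `CMAKE_PREFIX_PATH`, roughly like this (the install prefixes are illustrative, not taken from this diff):

```bash
# Point CMake at the installed Llama and CLBlast packages when configuring main-cmake-pkg
cmake -B build -DCMAKE_PREFIX_PATH="C:/LlamaCPP;C:/CLBlast" .
cmake --build build --config Release
```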


@@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
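
As a hedged illustration of how the GPU options above combine on a multi-GPU machine (model path, prompt, and split ratio are placeholders):

```bash
# Offload 35 layers, keep small tensors on GPU 0, split large tensors 60/40 across two GPUs
./main -m models/7B/ggml-model-q4_0.gguf -p "Hello" -ngl 35 -mg 0 -ts 3,2
```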


@@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
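
A typical server invocation using the options documented above might look like the following (model path and port are placeholders):

```bash
# Start the HTTP server with a 2048-token context and all layers offloaded to the GPU
./server -m models/7B/ggml-model-q4_0.gguf -c 2048 -ngl 99 --port 8080
```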


@@ -2511,15 +2511,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
-#ifndef GGML_USE_CUBLAS
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
 } else if (arg == "--tensor-split" || arg == "-ts") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 std::string arg_next = argv[i];
 // split string by , and /
@@ -2536,17 +2536,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }
 }
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
 } else if (arg == "--main-gpu" || arg == "-mg") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 params.main_gpu = std::stoi(argv[i]);
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
 } else if (arg == "--lora") {
 if (++i >= argc) {


@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
 ggml_backend_cuda_reg_devices();
 #endif

ggml.c

@@ -21719,15 +21719,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
 return 1;
 #else
 return 0;
 #endif
 }
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;
@@ -21767,7 +21767,7 @@ int ggml_cpu_has_sycl(void) {
 }
 int ggml_cpu_has_gpublas(void) {
-return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
 ggml_cpu_has_sycl();
 }

ggml.h

@@ -2361,7 +2361,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_fp16_va (void);
 GGML_API int ggml_cpu_has_wasm_simd (void);
 GGML_API int ggml_cpu_has_blas (void);
-GGML_API int ggml_cpu_has_cublas (void);
+GGML_API int ggml_cpu_has_cuda (void);
 GGML_API int ggml_cpu_has_clblast (void);
 GGML_API int ggml_cpu_has_vulkan (void);
 GGML_API int ggml_cpu_has_kompute (void);


@@ -829,7 +829,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info;
 printf("System Info: %s\n", llama_print_system_info());
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(file_format!=FileFormat::GGUF_GENERIC)
 {
 if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
@@ -909,7 +909,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.rope_freq_scale = rope_freq_scale;
 llama_ctx_params.n_batch = kcpp_params->n_batch;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {
@@ -997,7 +997,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 }
 #endif
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
 {
 printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
@@ -1006,7 +1006,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 #endif
 model_params.main_gpu = cu_parseinfo_maindevice;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
 #else
 model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
@@ -1016,7 +1016,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.n_threads = kcpp_params->n_threads;
 llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {


@@ -9,13 +9,11 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
+#elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
-#endif
-#if defined(GGML_USE_VULKAN)
+#elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
@@ -1533,7 +1531,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1563,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1589,7 +1587,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1610,7 +1608,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1622,7 +1620,7 @@ static size_t llama_get_device_count() {
 }
 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -2112,7 +2110,7 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
 }
@@ -5341,7 +5339,7 @@ static bool llm_load_tensors(
 }
 model.bufs.push_back(buf);
 bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_layer >= n_gpu_layers) {
 ggml_backend_cuda_register_host_buffer(
 ggml_backend_buffer_get_base(buf),
@@ -13698,7 +13696,7 @@ bool llama_supports_mlock(void) {
 }
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
 defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
@@ -13904,7 +13902,7 @@ struct llama_context * llama_new_context_with_model(
 }
 ctx->backends.push_back(ctx->backend_metal);
 }
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
 // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -14051,7 +14049,7 @@ struct llama_context * llama_new_context_with_model(
 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
 bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
 // pipeline parallelism requires support for async compute and events
 // currently this is only implemented in the CUDA backend
 pipeline_parallel = false;


@@ -136,7 +136,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v2-cuda.h"
 #include "ggml_v2-cuda-legacy.h"
 #endif
@@ -3895,7 +3895,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
 GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(quants_unshuffled)
 {
 ggml_v2_init_cublas();
@@ -9456,7 +9456,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9656,7 +9656,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9901,7 +9901,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -14087,7 +14087,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *
 size_t cur = 0;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
 node->n_tasks = 1; // TODO: this actually is doing nothing
 // the threads are still spinning
@@ -15585,7 +15585,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }
 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -15593,7 +15593,7 @@ int ggml_v2_cpu_has_blas(void) {
 }
 int ggml_v2_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;


@@ -1367,7 +1367,7 @@ inline static void * ggml_v3_aligned_malloc(size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v3-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v3-opencl.h"
@@ -3413,7 +3413,7 @@ struct ggml_v3_context * ggml_v3_init(struct ggml_v3_init_params params) {
 GGML_V3_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 ggml_v3_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 ggml_v3_cl_init();
@@ -11325,7 +11325,7 @@ static void ggml_v3_compute_forward_out_prod_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_CLBLAST)
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11520,7 +11520,7 @@ static void ggml_v3_compute_forward_out_prod_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 if (params->type == GGML_V3_TASK_INIT) {
@@ -15587,14 +15587,14 @@ static void ggml_v3_compute_forward(struct ggml_v3_compute_params * params, stru
 return;
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 bool skip_cpu = ggml_v3_cuda_compute_forward(params, tensor);
 if (skip_cpu) {
 return;
 }
 GGML_V3_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_V3_BACKEND_CPU);
 GGML_V3_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_V3_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 switch (tensor->op) {
 case GGML_V3_OP_DUP:
@@ -21106,7 +21106,7 @@ int ggml_v3_cpu_has_wasm_simd(void) {
 }
 int ggml_v3_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -21114,7 +21114,7 @@ int ggml_v3_cpu_has_blas(void) {
 }
 int ggml_v3_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -353,7 +353,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -342,7 +342,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -504,7 +504,7 @@ struct llama_v3_buffer {
 llama_v3_buffer& operator=(llama_v3_buffer&&) = delete;
 };
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 struct llama_v3_ctx_buffer {
 uint8_t * addr = NULL;


@@ -407,7 +407,7 @@ struct llama_v2_buffer {
 llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
 uint8_t * addr = NULL;


@@ -10,7 +10,7 @@
 #include "ggml_v2.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1063,7 +1063,7 @@ static void llama_v2_model_load_internal(
 ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 model.mapping = std::move(ml->mapping);
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 if(GetQuantsUnshuffled())


@@ -13,7 +13,7 @@
 #include "ggml_v3.h"
 #include "otherarch.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -61,7 +61,7 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
-#if !defined(GGML_USE_CUBLAS)
+#if !defined(GGML_USE_CUDA)
 #define LLAMA_V3_USE_ALLOCATOR
 #else
 #define LLAMA_V3_USE_SCRATCH
@@ -270,10 +270,10 @@ struct llama_v3_kv_cache {
 ggml_v3_free(ctx);
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 ggml_v3_cuda_free_data(k);
 ggml_v3_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 }
 };
@@ -329,7 +329,7 @@ struct llama_v3_model {
 ggml_v3_free(ctx);
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 ggml_v3_cuda_free_data(tensors_by_name[i].second);
 }
@@ -795,7 +795,7 @@ struct llama_v3_model_loader {
 lmlock->grow_to(lock_size);
 }
 break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 case GGML_V3_BACKEND_GPU:
 case GGML_V3_BACKEND_GPU_SPLIT:
 ggml_v3_cuda_transform_tensor(lt.data, lt.ggml_v3_tensor);
@@ -882,14 +882,14 @@ static bool kv_cache_init(
 ggml_v3_set_name(cache.v, "cache_v");
 (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer + 1) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.v);
 }
 if (n_gpu_layers > n_layer + 2) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.k);
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 return true;
 }
@@ -1181,7 +1181,7 @@ static void llama_v3_model_load_internal(
 (void) main_gpu;
 (void) mul_mat_q;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 LLAMA_V3_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
 ggml_v3_cuda_set_main_device(main_gpu);
 ggml_v3_cuda_set_mul_mat_q(mul_mat_q);
@@ -1298,7 +1298,7 @@ static void llama_v3_model_load_internal(
 (void) vram_scratch;
 (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (low_vram) {
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_v3_cuda_set_scratch_size(0); // disable scratch
@@ -1313,9 +1313,9 @@ static void llama_v3_model_load_internal(
 (vram_scratch + MB3 - 1) / MB3); // round up
 }
 }
-#endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#endif // GGML_USE_CUDA
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 LLAMA_V3_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1324,7 +1324,7 @@ static void llama_v3_model_load_internal(
 }
 size_t vram_kv_cache = 0;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 const int max_backend_supported_layers = hparams.n_layer + 3;
 const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
@@ -1346,7 +1346,7 @@ static void llama_v3_model_load_internal(
 #elif defined(GGML_USE_CLBLAST)
 const int max_backend_supported_layers = hparams.n_layer + 1;
 const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 LLAMA_V3_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
@@ -1354,7 +1354,7 @@ static void llama_v3_model_load_internal(
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB3 - 1) / MB3); // round up
 #else
 (void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#endif // defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 }
 // populate `tensors_by_name`
@@ -1363,7 +1363,7 @@ static void llama_v3_model_load_internal(
 }
 (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 ggml_v3_cuda_set_tensor_split(tensor_split);
 }
@@ -1510,7 +1510,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 offload_func_v3_t offload_func_kq = llama_v3_nop;
 offload_func_v3_t offload_func_v = llama_v3_nop;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer) {
 offload_func_nr = ggml_v3_cuda_assign_buffers;
 }
@@ -1520,7 +1520,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 if (n_gpu_layers > n_layer + 2) {
 offload_func_kq = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 struct ggml_v3_tensor * KQ_scale = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_F32, 1);
 #ifdef LLAMA_V3_USE_ALLOCATOR
@@ -1541,11 +1541,11 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 offload_func_v3_t offload_func = llama_v3_nop;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (il >= i_gpu_start) {
 offload_func = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 struct ggml_v3_tensor * inpSA = inpL;
@@ -3661,19 +3661,19 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 offload_func_v3_t offload_func = llama_v3_nop;
 offload_func_v3_t offload_func_force_inplace = llama_v3_nop;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 if (dest_t->backend == GGML_V3_BACKEND_GPU || dest_t->backend == GGML_V3_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_V3_TYPE_F16) {
 printf("\nError: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models\n");
 throw std::runtime_error(format_old(
 "%s: error: lora failed", __func__));
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 offload_func = ggml_v3_cuda_assign_buffers;
 offload_func_force_inplace = ggml_v3_cuda_assign_buffers_force_inplace;
 #endif
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 ggml_v3_tensor * base_t;
 if (model_loader) {


@@ -2,12 +2,12 @@
 #define LLAMA_V3_H
 #include "ggml_v3.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #define LLAMA_V3_MAX_DEVICES GGML_V3_CUDA_MAX_DEVICES
 #else
 #define LLAMA_V3_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -48,7 +48,7 @@
 #define LLAMA_V3_DEFAULT_SEED 0xFFFFFFFF
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_V3_SUPPORTS_GPU_OFFLOAD
 #endif


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -295,7 +295,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -14,7 +14,7 @@
 #include <iostream>
 #include <algorithm>
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -329,7 +329,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -6,7 +6,7 @@
 #include "rwkv_v3.h"
 #include "ggml_v3.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1076,7 +1076,7 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
 const size_t n_threads,
 const size_t sequence_len = 1
 ) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 enum ggml_v3_type mul_mat_type = type == GGML_V3_TYPE_F32 ? GGML_V3_TYPE_F32 : GGML_V3_TYPE_F16;
 #else
 enum ggml_v3_type mul_mat_type = ggml_v3_is_quantized(type) ? GGML_V3_TYPE_Q8_1 : type;
@@ -1566,7 +1566,7 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }
 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 printf("\nOffloading %u (or fewer) layers...",n_layers);
 const auto offload = [&](struct ggml_v3_tensor * tensor) {
 // TODO support multi-GPU


@@ -1,10 +0,0 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
echo ""
exit 0


@@ -1,213 +0,0 @@
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#
if [ -z "$1" ]; then
echo "Usage: $0 <data>"
echo " 0: no models"
echo " 1: tinyllama-1b"
echo " 2: codellama-7b"
echo " 3: codellama-13b"
echo " 4: codellama-34b"
echo " 5: codellama-7b-instruct"
echo " 6: codellama-13b-instruct"
echo " 7: codellama-34b-instruct"
exit 1
fi
set -x
# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install
if [ ! -d "/workspace" ]; then
ln -sfn $(pwd) /workspace
fi
# download data
cd /workspace
# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
LLAMA_CUBLAS=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
pip install -r requirements.txt
# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DLLAMA_CUBLAS=1 ../
make -j
if [ "$1" -eq "0" ]; then
exit 0
fi
# more models
if [ "$1" -eq "1" ]; then
cd /workspace
git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
cd /workspace/llama.cpp
python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "2" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
rm -v ./CodeLlama-7b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "3" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
rm -v ./CodeLlama-13b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "4" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
rm -v ./CodeLlama-34b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "5" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "6" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "7" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "1" ]; then
# perf + perplexity
cd /workspace/llama.cpp/build-cublas
make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
../scripts/get-wikitext-2.sh
unzip wikitext-2-raw-v1.zip
make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
# batched
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
# speculative
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

View file

@@ -1,10 +0,0 @@
import { readFileSync } from "fs"
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
const [, , file] = process.argv
const url = `file://${file}`
let schema = JSON.parse(readFileSync(file, "utf8"));
const converter = new SchemaConverter({})
schema = await converter.resolveRefs(schema, url)
converter.visit(schema, '')
console.log(converter.formatGrammar())

View file

@@ -1,79 +0,0 @@
#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#undef NDEBUG
#include <cassert>
#include "llama.h"
int main(void) {
llama_chat_message conversation[] = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "Who are you"},
{"assistant", " I am an assistant "},
{"user", "Another question"},
};
size_t message_count = 6;
std::vector<std::string> templates = {
// teknium/OpenHermes-2.5-Mistral-7B
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
// mistralai/Mistral-7B-Instruct-v0.2
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
// mlabonne/AlphaMonarch-7B
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
// google/gemma-7b-it
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
// OrionStarAI/Orion-14B-Chat
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
// mistralai/Mistral-7B-Instruct-v0.2
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
// bofenghuang/vigogne-2-70b-chat
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// mlabonne/AlphaMonarch-7B
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
// google/gemma-7b-it
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
// OrionStarAI/Orion-14B-Chat
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
};
std::vector<char> formatted_chat(1024);
int32_t res;
// test invalid chat template
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
assert(res < 0);
for (size_t i = 0; i < templates.size(); i++) {
std::string custom_template = templates[i];
std::string expected = expected_output[i];
formatted_chat.resize(1024);
res = llama_chat_apply_template(
nullptr,
custom_template.c_str(),
conversation,
message_count,
true,
formatted_chat.data(),
formatted_chat.size()
);
formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n";
assert(output == expected);
}
return 0;
}
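For reference, a minimal sketch of the call pattern the deleted chat-template test exercises: pass nullptr for the model to force a custom template string, call llama_chat_apply_template once, and use the returned length to size the output buffer. The 1024-byte initial buffer and the retry-on-overflow step are illustrative assumptions, not part of the original test.
// Sketch only: same llama_chat_apply_template signature and nullptr-model usage as in the test above.
#include <cstdio>
#include <vector>
#include "llama.h"
int main() {
    llama_chat_message msgs[] = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    // ChatML-style template taken from the test's template list (teknium/OpenHermes-2.5-Mistral-7B)
    const char * tmpl =
        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}";
    std::vector<char> buf(1024);                       // assumed initial size
    int32_t n = llama_chat_apply_template(nullptr, tmpl, msgs, 2, true, buf.data(), buf.size());
    if (n < 0) {
        fprintf(stderr, "template not supported\n");   // the test asserts res < 0 for an invalid template
        return 1;
    }
    if ((size_t) n > buf.size()) {                     // the return value is the required formatted length
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, tmpl, msgs, 2, true, buf.data(), buf.size());
    }
    printf("%.*s\n", n, buf.data());
    return 0;
}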

View file

@@ -1,842 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <fstream>
#include <sstream>
#include <regex>
#include "json-schema-to-grammar.h"
#include "grammar-parser.h"
static std::string trim(const std::string & source) {
std::string s(source);
s.erase(0,s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t")+1);
return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1");
}
enum TestCaseStatus {
SUCCESS,
FAILURE
};
struct TestCase {
TestCaseStatus expected_status;
std::string name;
std::string schema;
std::string expected_grammar;
void _print_failure_header() const {
fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str());
}
void verify(const std::string & actual_grammar) const {
if (trim(actual_grammar) != trim(expected_grammar)) {
_print_failure_header();
fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str());
assert(false);
}
}
void verify_expectation_parseable() const {
try {
auto state = grammar_parser::parse(expected_grammar.c_str());
if (state.symbol_ids.find("root") == state.symbol_ids.end()) {
throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar);
}
} catch (const std::runtime_error & ex) {
_print_failure_header();
fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what());
assert(false);
}
}
void verify_status(TestCaseStatus status) const {
if (status != expected_status) {
_print_failure_header();
fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE");
fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? "SUCCESS" : "FAILURE");
assert(false);
}
}
};
static void write(const std::string & file, const std::string & content) {
std::ofstream f;
f.open(file.c_str());
f << content.c_str();
f.close();
}
static std::string read(const std::string & file) {
std::ostringstream actuals;
actuals << std::ifstream(file.c_str()).rdbuf();
return actuals.str();
}
static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str());
auto test = [&](const TestCase & tc) {
fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? " (failure expected)" : "");
runner(tc);
};
test({
FAILURE,
"unknown type",
R"""({
"type": "kaboom"
})""",
""
});
test({
FAILURE,
"invalid type",
R"""({
"type": 123
})""",
""
});
test({
SUCCESS,
"empty schema (object)",
"{}",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"exotic formats",
R"""({
"items": [
{ "format": "date" },
{ "format": "uuid" },
{ "format": "time" },
{ "format": "date-time" }
]
})""",
R"""(
date ::= [0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
date-string ::= "\"" date "\"" space
date-time ::= date "T" time
date-time-string ::= "\"" date-time "\"" space
root ::= "[" space date-string "," space uuid "," space time-string "," space date-time-string "]" space
space ::= " "?
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
time-string ::= "\"" time "\"" space
uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space
)"""
});
test({
SUCCESS,
"string",
R"""({
"type": "string"
})""",
R"""(
root ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"boolean",
R"""({
"type": "boolean"
})""",
R"""(
root ::= ("true" | "false") space
space ::= " "?
)"""
});
test({
SUCCESS,
"integer",
R"""({
"type": "integer"
})""",
R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
space ::= " "?
)"""
});
test({
SUCCESS,
"string const",
R"""({
"const": "foo"
})""",
R"""(
root ::= "\"foo\""
space ::= " "?
)"""
});
test({
SUCCESS,
"non-string const",
R"""({
"const": 123
})""",
R"""(
root ::= "123"
space ::= " "?
)"""
});
test({
SUCCESS,
"non-string enum",
R"""({
"enum": ["red", "amber", "green", null, 42, ["foo"]]
})""",
R"""(
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
space ::= " "?
)"""
});
test({
SUCCESS,
"tuple1",
R"""({
"prefixItems": [{ "type": "string" }]
})""",
R"""(
root ::= "[" space string "]" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"tuple2",
R"""({
"prefixItems": [{ "type": "string" }, { "type": "number" }]
})""",
R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "[" space string "," space number "]" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"number",
R"""({
"type": "number"
})""",
R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
space ::= " "?
)"""
});
test({
SUCCESS,
"minItems",
R"""({
"items": {
"type": "boolean"
},
"minItems": 2
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space boolean ( "," space boolean )( "," space boolean )* "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"maxItems 1",
R"""({
"items": {
"type": "boolean"
},
"maxItems": 1
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space ( boolean )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"maxItems 2",
R"""({
"items": {
"type": "boolean"
},
"maxItems": 2
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space ( boolean ( "," space boolean )? )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"min + maxItems",
R"""({
"items": {
"type": ["number", "integer"]
},
"minItems": 3,
"maxItems": 5
})""",
R"""(
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
item ::= number | integer
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"simple regexp",
R"""({
"type": "string",
"pattern": "^abc?d*efg+(hij)?kl$"
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp escapes",
R"""({
"type": "string",
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp quote",
R"""({
"type": "string",
"pattern": "^\"$"
})""",
R"""(
root ::= "\"" "\"" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp",
R"""({
"type": "string",
"pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} and...$"
})""",
R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
root-1 ::= [0-9]
space ::= " "?
)"""
});
test({
SUCCESS,
"required props in original order",
R"""({
"type": "object",
"properties": {
"b": {"type": "string"},
"c": {"type": "string"},
"a": {"type": "string"}
},
"required": [
"a",
"b",
"c"
],
"additionalProperties": false,
"definitions": {}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"1 optional prop",
R"""({
"properties": {
"a": {
"type": "string"
}
},
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
root ::= "{" space (a-kv )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"N optional props",
R"""({
"properties": {
"a": {"type": "string"},
"b": {"type": "string"},
"c": {"type": "string"}
},
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
a-rest ::= ( "," space b-kv )? b-rest
b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"required + optional props each in original order",
R"""({
"properties": {
"b": {"type": "string"},
"a": {"type": "string"},
"d": {"type": "string"},
"c": {"type": "string"}
},
"required": ["a", "b"],
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"additional props",
R"""({
"type": "object",
"additionalProperties": {"type": "array", "items": {"type": "number"}}
})""",
R"""(
additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space ( number ( "," space number )* )? "]" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space (additional-kvs )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"additional props (true)",
R"""({
"type": "object",
"additionalProperties": true
})""",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"additional props (implicit)",
R"""({
"type": "object"
})""",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"empty w/o additional props",
R"""({
"type": "object",
"additionalProperties": false
})""",
R"""(
root ::= "{" space "}" space
space ::= " "?
)"""
});
test({
SUCCESS,
"required + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"}
},
"required": ["a"],
"additionalProperties": {"type": "string"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"optional + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"}
},
"additionalProperties": {"type": "number"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
a-rest ::= additional-kvs
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"required + optional + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"},
"b": {"type": "number"}
},
"required": ["a"],
"additionalProperties": {"type": "number"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"top-level $ref",
R"""({
"$ref": "#/definitions/MyType",
"definitions": {
"MyType": {
"type": "object",
"properties": {
"a": {
"type": "string"
}
},
"required": [
"a"
],
"additionalProperties": false
}
}
})""",
R"""(
MyType ::= "{" space MyType-a-kv "}" space
MyType-a-kv ::= "\"a\"" space ":" space string
root ::= MyType
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"anyOf",
R"""({
"anyOf": [
{"$ref": "#/definitions/foo"},
{"$ref": "#/definitions/bar"}
],
"definitions": {
"foo": {
"properties": {"a": {"type": "number"}}
},
"bar": {
"properties": {"b": {"type": "number"}}
}
},
"type": "object"
})""",
R"""(
alternative-0 ::= foo
alternative-1 ::= bar
bar ::= "{" space (bar-b-kv )? "}" space
bar-b-kv ::= "\"b\"" space ":" space number
foo ::= "{" space (foo-a-kv )? "}" space
foo-a-kv ::= "\"a\"" space ":" space number
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= alternative-0 | alternative-1
space ::= " "?
)"""
});
test({
SUCCESS,
"mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)",
R"""({
"allOf": [
{"$ref": "#/definitions/foo"},
{"$ref": "#/definitions/bar"},
{
"anyOf": [
{"$ref": "#/definitions/baz"},
{"$ref": "#/definitions/bam"}
]
}
],
"definitions": {
"foo": {
"properties": {"a": {"type": "number"}}
},
"bar": {
"properties": {"b": {"type": "number"}}
},
"bam": {
"properties": {"c": {"type": "number"}}
},
"baz": {
"properties": {"d": {"type": "number"}}
}
},
"type": "object"
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
b-kv ::= "\"b\"" space ":" space number
c-kv ::= "\"c\"" space ":" space number
d-kv ::= "\"d\"" space ":" space number
d-rest ::= ( "," space c-kv )?
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "?
)"""
});
test({
SUCCESS,
"conflicting names",
R"""({
"type": "object",
"properties": {
"number": {
"type": "object",
"properties": {
"number": {
"type": "object",
"properties": {
"root": {
"type": "number"
}
},
"required": [
"root"
],
"additionalProperties": false
}
},
"required": [
"number"
],
"additionalProperties": false
}
},
"required": [
"number"
],
"additionalProperties": false,
"definitions": {}
})""",
R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
number- ::= "{" space number-number-kv "}" space
number-kv ::= "\"number\"" space ":" space number-
number-number ::= "{" space number-number-root-kv "}" space
number-number-kv ::= "\"number\"" space ":" space number-number
number-number-root-kv ::= "\"root\"" space ":" space number
root ::= "{" space number-kv "}" space
space ::= " "?
)"""
});
}
int main() {
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
test_all("C++", [](const TestCase & tc) {
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
tc.verify_status(SUCCESS);
} catch (const std::runtime_error & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
}
test_all("Check Expectations Validity", [](const TestCase & tc) {
if (tc.expected_status == SUCCESS) {
tc.verify_expectation_parseable();
}
});
}
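The C++ runner above is a thin wrapper around json_schema_to_grammar; a minimal standalone sketch of the same conversion follows. The example schema is my own, and, as in the test, the json-schema-to-grammar.h header is assumed to pull in nlohmann::ordered_json.
// Sketch only: convert one JSON schema to a GBNF grammar and print it,
// mirroring the C++ branch of test_all() above.
#include <cstdio>
#include <string>
#include "json-schema-to-grammar.h"
int main() {
    const std::string schema = R"""({
        "type": "object",
        "properties": { "a": { "type": "string" } },
        "required": ["a"],
        "additionalProperties": false
    })""";
    const std::string grammar = json_schema_to_grammar(nlohmann::ordered_json::parse(schema));
    printf("%s\n", grammar.c_str());
    return 0;
}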