Merge commit '280345968d' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/main-cuda.Dockerfile
#	.devops/nix/package.nix
#	.devops/server-cuda.Dockerfile
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.lock
#	llama.cpp
#	scripts/LlamaConfig.cmake.in
#	scripts/compare-commits.sh
#	scripts/server-llm.sh
#	tests/test-quantize-fns.cpp
Commit a530afa1e4 by Concedo, 2024-04-07 20:27:17 +08:00
33 changed files with 124 additions and 1280 deletions


@@ -80,7 +80,7 @@ if (LLAMA_CUBLAS)
 enable_language(CUDA)
-add_compile_definitions(GGML_USE_CUBLAS)
+add_compile_definitions(GGML_USE_CUDA)
 add_compile_definitions(SD_USE_CUBLAS)
 add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
@@ -152,7 +152,7 @@ if (LLAMA_HIPBLAS)
 message(STATUS "HIP and hipBLAS found")
 file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
 list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS SD_USE_CUBLAS)
+add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
 add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
 if (LLAMA_CUDA_FORCE_DMMV)
 target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)


@@ -55,7 +55,7 @@ CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
 else
 CUBLAS_FLAGS =
 endif
@@ -142,7 +142,7 @@ endif
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 NVCC = nvcc
@@ -226,7 +226,7 @@ ifdef LLAMA_HIPBLAS
 LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 1
 LLAMA_CUDA_KQUANTS_ITER ?= 2
-HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
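
Note that this fork's Makefile (and its CMakeLists.txt above) still keys the CUDA build off the LLAMA_CUBLAS switch; only the preprocessor define it emits changes from GGML_USE_CUBLAS to GGML_USE_CUDA, while the upstream docs further down now say `LLAMA_CUDA=1`. As a rough sketch, a CUDA build of this tree would therefore still be started like this:

```bash
# The build switch is unchanged here; it now passes -DGGML_USE_CUDA to the compiler.
make LLAMA_CUBLAS=1 -j
```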


@@ -49,12 +49,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
 #endif
 #if defined(LLAMA_USE_CURL)
@@ -862,9 +862,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 return true;
 }
 params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--split-mode" || arg == "-sm") {
@@ -890,9 +890,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 invalid_param = true;
 return true;
 }
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--tensor-split" || arg == "-ts") {
@@ -918,9 +918,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.tensor_split[i] = 0.0f;
 }
 }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
 return true;
 }
 if (arg == "--no-mmap") {
@@ -2388,7 +2388,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
 fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
 fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
 fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");


@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99


@@ -114,7 +114,7 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
 std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 int count = ggml_backend_cuda_get_device_count();
 for (int i = 0; i < count; i++) {
 char buf[128];
@@ -809,7 +809,7 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();


@@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 ### run on Orin


@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 }
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 new_clip->backend = ggml_backend_cuda_init(0);
 printf("%s: CLIP using CUDA backend\n", __func__);
 #endif


@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
 ### Considerations
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
 ### Build llama.cpp and install to C:\LlamaCPP directory
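
For reference, the configure step that the Considerations paragraph above describes would pass both package locations through `CMAKE_PREFIX_PATH`, roughly like this (the install prefixes are illustrative, not taken from this diff):

```bash
# Point CMake at the installed Llama and CLBlast packages when configuring main-cmake-pkg
cmake -B build -DCMAKE_PREFIX_PATH="C:/LlamaCPP;C:/CLBlast" .
cmake --build build --config Release
```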


@@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
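
As a hedged illustration of how the GPU options above combine on a multi-GPU machine (model path, prompt, and split ratio are placeholders):

```bash
# Offload 35 layers, keep small tensors on GPU 0, split large tensors 60/40 across two GPUs
./main -m models/7B/ggml-model-q4_0.gguf -p "Hello" -ngl 35 -mg 0 -ts 3,2
```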


@@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
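
A typical server invocation using the options documented above might look like the following (model path and port are placeholders):

```bash
# Start the HTTP server with a 2048-token context and all layers offloaded to the GPU
./server -m models/7B/ggml-model-q4_0.gguf -c 2048 -ngl 99 --port 8080
```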


@@ -2511,15 +2511,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
-#ifndef GGML_USE_CUBLAS
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
 } else if (arg == "--tensor-split" || arg == "-ts") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 std::string arg_next = argv[i];
 // split string by , and /
@@ -2536,17 +2536,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }
 }
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
 } else if (arg == "--main-gpu" || arg == "-mg") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 params.main_gpu = std::stoi(argv[i]);
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
 } else if (arg == "--lora") {
 if (++i >= argc) {


@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
 ggml_backend_cuda_reg_devices();
 #endif

ggml.c

@@ -21719,15 +21719,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
 return 1;
 #else
 return 0;
 #endif
 }
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;
@@ -21767,7 +21767,7 @@ int ggml_cpu_has_sycl(void) {
 }
 int ggml_cpu_has_gpublas(void) {
-return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
 ggml_cpu_has_sycl();
 }

ggml.h

@@ -2361,7 +2361,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_fp16_va (void);
 GGML_API int ggml_cpu_has_wasm_simd (void);
 GGML_API int ggml_cpu_has_blas (void);
-GGML_API int ggml_cpu_has_cublas (void);
+GGML_API int ggml_cpu_has_cuda (void);
 GGML_API int ggml_cpu_has_clblast (void);
 GGML_API int ggml_cpu_has_vulkan (void);
 GGML_API int ggml_cpu_has_kompute (void);


@@ -829,7 +829,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info;
 printf("System Info: %s\n", llama_print_system_info());
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(file_format!=FileFormat::GGUF_GENERIC)
 {
 if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
@@ -909,7 +909,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.rope_freq_scale = rope_freq_scale;
 llama_ctx_params.n_batch = kcpp_params->n_batch;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {
@@ -997,7 +997,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 }
 #endif
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
 {
 printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
@@ -1006,7 +1006,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 #endif
 model_params.main_gpu = cu_parseinfo_maindevice;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
 #else
 model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
@@ -1016,7 +1016,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.n_threads = kcpp_params->n_threads;
 llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {


@@ -9,13 +9,11 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
+#elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
-#endif
-#if defined(GGML_USE_VULKAN)
+#elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
@@ -1533,7 +1531,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1563,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1589,7 +1587,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1610,7 +1608,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1622,7 +1620,7 @@ static size_t llama_get_device_count() {
 }
 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -2112,7 +2110,7 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
 }
@@ -5341,7 +5339,7 @@ static bool llm_load_tensors(
 }
 model.bufs.push_back(buf);
 bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_layer >= n_gpu_layers) {
 ggml_backend_cuda_register_host_buffer(
 ggml_backend_buffer_get_base(buf),
@@ -13698,7 +13696,7 @@ bool llama_supports_mlock(void) {
 }
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
 defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
@@ -13904,7 +13902,7 @@ struct llama_context * llama_new_context_with_model(
 }
 ctx->backends.push_back(ctx->backend_metal);
 }
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
 // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -14051,7 +14049,7 @@ struct llama_context * llama_new_context_with_model(
 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
 bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
 // pipeline parallelism requires support for async compute and events
 // currently this is only implemented in the CUDA backend
 pipeline_parallel = false;


@@ -136,7 +136,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v2-cuda.h"
 #include "ggml_v2-cuda-legacy.h"
 #endif
@@ -3895,7 +3895,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
 GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(quants_unshuffled)
 {
 ggml_v2_init_cublas();
@@ -9456,7 +9456,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9656,7 +9656,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9901,7 +9901,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -14087,7 +14087,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *
 size_t cur = 0;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
 node->n_tasks = 1; // TODO: this actually is doing nothing
 // the threads are still spinning
@@ -15585,7 +15585,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }
 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -15593,7 +15593,7 @@ int ggml_v2_cpu_has_blas(void) {
 }
 int ggml_v2_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;


@@ -1367,7 +1367,7 @@ inline static void * ggml_v3_aligned_malloc(size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v3-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v3-opencl.h"
@@ -3413,7 +3413,7 @@ struct ggml_v3_context * ggml_v3_init(struct ggml_v3_init_params params) {
 GGML_V3_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 ggml_v3_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 ggml_v3_cl_init();
@@ -11325,7 +11325,7 @@ static void ggml_v3_compute_forward_out_prod_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_CLBLAST)
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11520,7 +11520,7 @@ static void ggml_v3_compute_forward_out_prod_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 if (params->type == GGML_V3_TASK_INIT) {
@@ -15587,14 +15587,14 @@ static void ggml_v3_compute_forward(struct ggml_v3_compute_params * params, stru
 return;
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 bool skip_cpu = ggml_v3_cuda_compute_forward(params, tensor);
 if (skip_cpu) {
 return;
 }
 GGML_V3_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_V3_BACKEND_CPU);
 GGML_V3_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_V3_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 switch (tensor->op) {
 case GGML_V3_OP_DUP:
@@ -21106,7 +21106,7 @@ int ggml_v3_cpu_has_wasm_simd(void) {
 }
 int ggml_v3_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -21114,7 +21114,7 @@ int ggml_v3_cpu_has_blas(void) {
 }
 int ggml_v3_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -353,7 +353,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -342,7 +342,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -504,7 +504,7 @@ struct llama_v3_buffer {
 llama_v3_buffer& operator=(llama_v3_buffer&&) = delete;
 };
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 struct llama_v3_ctx_buffer {
 uint8_t * addr = NULL;


@@ -407,7 +407,7 @@ struct llama_v2_buffer {
 llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
 uint8_t * addr = NULL;


@@ -10,7 +10,7 @@
 #include "ggml_v2.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1063,7 +1063,7 @@ static void llama_v2_model_load_internal(
 ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 model.mapping = std::move(ml->mapping);
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 if(GetQuantsUnshuffled())


@@ -13,7 +13,7 @@
 #include "ggml_v3.h"
 #include "otherarch.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -61,7 +61,7 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
-#if !defined(GGML_USE_CUBLAS)
+#if !defined(GGML_USE_CUDA)
 #define LLAMA_V3_USE_ALLOCATOR
 #else
 #define LLAMA_V3_USE_SCRATCH
@@ -270,10 +270,10 @@ struct llama_v3_kv_cache {
 ggml_v3_free(ctx);
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 ggml_v3_cuda_free_data(k);
 ggml_v3_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 }
 };
@@ -329,7 +329,7 @@ struct llama_v3_model {
 ggml_v3_free(ctx);
 }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 ggml_v3_cuda_free_data(tensors_by_name[i].second);
 }
@@ -795,7 +795,7 @@ struct llama_v3_model_loader {
 lmlock->grow_to(lock_size);
 }
 break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 case GGML_V3_BACKEND_GPU:
 case GGML_V3_BACKEND_GPU_SPLIT:
 ggml_v3_cuda_transform_tensor(lt.data, lt.ggml_v3_tensor);
@@ -882,14 +882,14 @@ static bool kv_cache_init(
 ggml_v3_set_name(cache.v, "cache_v");
 (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer + 1) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.v);
 }
 if (n_gpu_layers > n_layer + 2) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.k);
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 return true;
 }
@@ -1181,7 +1181,7 @@ static void llama_v3_model_load_internal(
 (void) main_gpu;
 (void) mul_mat_q;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 LLAMA_V3_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
 ggml_v3_cuda_set_main_device(main_gpu);
 ggml_v3_cuda_set_mul_mat_q(mul_mat_q);
@@ -1298,7 +1298,7 @@ static void llama_v3_model_load_internal(
 (void) vram_scratch;
 (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (low_vram) {
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_v3_cuda_set_scratch_size(0); // disable scratch
@@ -1313,9 +1313,9 @@ static void llama_v3_model_load_internal(
 (vram_scratch + MB3 - 1) / MB3); // round up
 }
 }
-#endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#endif // GGML_USE_CUDA
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 LLAMA_V3_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1324,7 +1324,7 @@ static void llama_v3_model_load_internal(
 }
 size_t vram_kv_cache = 0;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 const int max_backend_supported_layers = hparams.n_layer + 3;
 const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
@@ -1346,7 +1346,7 @@ static void llama_v3_model_load_internal(
 #elif defined(GGML_USE_CLBLAST)
 const int max_backend_supported_layers = hparams.n_layer + 1;
 const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 LLAMA_V3_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
@@ -1354,7 +1354,7 @@ static void llama_v3_model_load_internal(
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB3 - 1) / MB3); // round up
 #else
 (void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#endif // defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 }
 // populate `tensors_by_name`
@@ -1363,7 +1363,7 @@ static void llama_v3_model_load_internal(
 }
 (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 ggml_v3_cuda_set_tensor_split(tensor_split);
 }
@@ -1510,7 +1510,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 offload_func_v3_t offload_func_kq = llama_v3_nop;
 offload_func_v3_t offload_func_v = llama_v3_nop;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer) {
 offload_func_nr = ggml_v3_cuda_assign_buffers;
 }
@@ -1520,7 +1520,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 if (n_gpu_layers > n_layer + 2) {
 offload_func_kq = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 struct ggml_v3_tensor * KQ_scale = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_F32, 1);
 #ifdef LLAMA_V3_USE_ALLOCATOR
@@ -1541,11 +1541,11 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 offload_func_v3_t offload_func = llama_v3_nop;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (il >= i_gpu_start) {
 offload_func = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 struct ggml_v3_tensor * inpSA = inpL;
@@ -3661,19 +3661,19 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 offload_func_v3_t offload_func = llama_v3_nop;
 offload_func_v3_t offload_func_force_inplace = llama_v3_nop;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 if (dest_t->backend == GGML_V3_BACKEND_GPU || dest_t->backend == GGML_V3_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_V3_TYPE_F16) {
 printf("\nError: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models\n");
 throw std::runtime_error(format_old(
 "%s: error: lora failed", __func__));
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 offload_func = ggml_v3_cuda_assign_buffers;
 offload_func_force_inplace = ggml_v3_cuda_assign_buffers_force_inplace;
 #endif
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 ggml_v3_tensor * base_t;
 if (model_loader) {


@@ -2,12 +2,12 @@
 #define LLAMA_V3_H
 #include "ggml_v3.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #define LLAMA_V3_MAX_DEVICES GGML_V3_CUDA_MAX_DEVICES
 #else
 #define LLAMA_V3_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -48,7 +48,7 @@
 #define LLAMA_V3_DEFAULT_SEED 0xFFFFFFFF
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_V3_SUPPORTS_GPU_OFFLOAD
 #endif


@@ -16,7 +16,7 @@
 #include "model_adapter.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -295,7 +295,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -14,7 +14,7 @@
 #include <iostream>
 #include <algorithm>
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -329,7 +329,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
 fin.close();
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;


@@ -6,7 +6,7 @@
 #include "rwkv_v3.h"
 #include "ggml_v3.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1076,7 +1076,7 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
 const size_t n_threads,
 const size_t sequence_len = 1
 ) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 enum ggml_v3_type mul_mat_type = type == GGML_V3_TYPE_F32 ? GGML_V3_TYPE_F32 : GGML_V3_TYPE_F16;
 #else
 enum ggml_v3_type mul_mat_type = ggml_v3_is_quantized(type) ? GGML_V3_TYPE_Q8_1 : type;
@@ -1566,7 +1566,7 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }
 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 printf("\nOffloading %u (or fewer) layers...",n_layers);
 const auto offload = [&](struct ggml_v3_tensor * tensor) {
 // TODO support multi-GPU


@@ -1,10 +0,0 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
echo ""
exit 0


@@ -1,213 +0,0 @@
#!/bin/bash
#
# Use this script only on fresh pods (runpod.io)!
# Otherwise, it can break your environment!
#
if [ -z "$1" ]; then
echo "Usage: $0 <data>"
echo " 0: no models"
echo " 1: tinyllama-1b"
echo " 2: codellama-7b"
echo " 3: codellama-13b"
echo " 4: codellama-34b"
echo " 5: codellama-7b-instruct"
echo " 6: codellama-13b-instruct"
echo " 7: codellama-34b-instruct"
exit 1
fi
set -x
# setup deps
apt-get update
apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
git-lfs install
if [ ! -d "/workspace" ]; then
ln -sfn $(pwd) /workspace
fi
# download data
cd /workspace
# this is useful to git clone repos without doubling the disk size due to .git
git clone https://github.com/iboB/git-lfs-download
ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
# llama.cpp
cd /workspace
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
LLAMA_CUBLAS=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
pip install -r requirements.txt
# cmake
cd /workspace/llama.cpp
mkdir build-cublas
cd build-cublas
cmake -DLLAMA_CUBLAS=1 ../
make -j
if [ "$1" -eq "0" ]; then
exit 0
fi
# more models
if [ "$1" -eq "1" ]; then
cd /workspace
git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
cd /workspace/llama.cpp
python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "2" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
rm -v ./CodeLlama-7b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "3" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
rm -v ./CodeLlama-13b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "4" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
rm -v ./CodeLlama-34b-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "5" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "6" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "7" ]; then
cd /workspace
git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
cd /workspace/llama.cpp
python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
fi
if [ "$1" -eq "1" ]; then
# perf + perplexity
cd /workspace/llama.cpp/build-cublas
make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
../scripts/get-wikitext-2.sh
unzip wikitext-2-raw-v1.zip
make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
# batched
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel
cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi
# speculative
#if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp
#
# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi
# more benches
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

View file

@@ -1,10 +0,0 @@
import { readFileSync } from "fs"
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
const [, , file] = process.argv
const url = `file://${file}`
let schema = JSON.parse(readFileSync(file, "utf8"));
const converter = new SchemaConverter({})
schema = await converter.resolveRefs(schema, url)
converter.visit(schema, '')
console.log(converter.formatGrammar())

View file

@@ -1,79 +0,0 @@
#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#undef NDEBUG
#include <cassert>
#include "llama.h"
int main(void) {
llama_chat_message conversation[] = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "Who are you"},
{"assistant", " I am an assistant "},
{"user", "Another question"},
};
size_t message_count = 6;
std::vector<std::string> templates = {
// teknium/OpenHermes-2.5-Mistral-7B
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
// mistralai/Mistral-7B-Instruct-v0.2
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
// mlabonne/AlphaMonarch-7B
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
// google/gemma-7b-it
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
// OrionStarAI/Orion-14B-Chat
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
// mistralai/Mistral-7B-Instruct-v0.2
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
// bofenghuang/vigogne-2-70b-chat
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// mlabonne/AlphaMonarch-7B
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
// google/gemma-7b-it
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
// OrionStarAI/Orion-14B-Chat
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
};
std::vector<char> formatted_chat(1024);
int32_t res;
// test invalid chat template
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
assert(res < 0);
for (size_t i = 0; i < templates.size(); i++) {
std::string custom_template = templates[i];
std::string expected = expected_output[i];
formatted_chat.resize(1024);
res = llama_chat_apply_template(
nullptr,
custom_template.c_str(),
conversation,
message_count,
true,
formatted_chat.data(),
formatted_chat.size()
);
formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n";
assert(output == expected);
}
return 0;
}
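For reference, a minimal sketch of the call pattern the deleted chat-template test exercises: pass nullptr for the model to force a custom template string, call llama_chat_apply_template once, and use the returned length to size the output buffer. The 1024-byte initial buffer and the retry-on-overflow step are illustrative assumptions, not part of the original test.
// Sketch only: same llama_chat_apply_template signature and nullptr-model usage as in the test above.
#include <cstdio>
#include <vector>
#include "llama.h"
int main() {
    llama_chat_message msgs[] = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    // ChatML-style template taken from the test's template list (teknium/OpenHermes-2.5-Mistral-7B)
    const char * tmpl =
        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}";
    std::vector<char> buf(1024);                       // assumed initial size
    int32_t n = llama_chat_apply_template(nullptr, tmpl, msgs, 2, true, buf.data(), buf.size());
    if (n < 0) {
        fprintf(stderr, "template not supported\n");   // the test asserts res < 0 for an invalid template
        return 1;
    }
    if ((size_t) n > buf.size()) {                     // the return value is the required formatted length
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, tmpl, msgs, 2, true, buf.data(), buf.size());
    }
    printf("%.*s\n", n, buf.data());
    return 0;
}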

View file

@@ -1,842 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <fstream>
#include <sstream>
#include <regex>
#include "json-schema-to-grammar.h"
#include "grammar-parser.h"
static std::string trim(const std::string & source) {
std::string s(source);
s.erase(0,s.find_first_not_of(" \n\r\t"));
s.erase(s.find_last_not_of(" \n\r\t")+1);
return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1");
}
enum TestCaseStatus {
SUCCESS,
FAILURE
};
struct TestCase {
TestCaseStatus expected_status;
std::string name;
std::string schema;
std::string expected_grammar;
void _print_failure_header() const {
fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str());
}
void verify(const std::string & actual_grammar) const {
if (trim(actual_grammar) != trim(expected_grammar)) {
_print_failure_header();
fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str());
assert(false);
}
}
void verify_expectation_parseable() const {
try {
auto state = grammar_parser::parse(expected_grammar.c_str());
if (state.symbol_ids.find("root") == state.symbol_ids.end()) {
throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar);
}
} catch (const std::runtime_error & ex) {
_print_failure_header();
fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what());
assert(false);
}
}
void verify_status(TestCaseStatus status) const {
if (status != expected_status) {
_print_failure_header();
fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE");
fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? "SUCCESS" : "FAILURE");
assert(false);
}
}
};
static void write(const std::string & file, const std::string & content) {
std::ofstream f;
f.open(file.c_str());
f << content.c_str();
f.close();
}
static std::string read(const std::string & file) {
std::ostringstream actuals;
actuals << std::ifstream(file.c_str()).rdbuf();
return actuals.str();
}
static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str());
auto test = [&](const TestCase & tc) {
fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? " (failure expected)" : "");
runner(tc);
};
test({
FAILURE,
"unknown type",
R"""({
"type": "kaboom"
})""",
""
});
test({
FAILURE,
"invalid type",
R"""({
"type": 123
})""",
""
});
test({
SUCCESS,
"empty schema (object)",
"{}",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"exotic formats",
R"""({
"items": [
{ "format": "date" },
{ "format": "uuid" },
{ "format": "time" },
{ "format": "date-time" }
]
})""",
R"""(
date ::= [0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
date-string ::= "\"" date "\"" space
date-time ::= date "T" time
date-time-string ::= "\"" date-time "\"" space
root ::= "[" space date-string "," space uuid "," space time-string "," space date-time-string "]" space
space ::= " "?
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
time-string ::= "\"" time "\"" space
uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space
)"""
});
test({
SUCCESS,
"string",
R"""({
"type": "string"
})""",
R"""(
root ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"boolean",
R"""({
"type": "boolean"
})""",
R"""(
root ::= ("true" | "false") space
space ::= " "?
)"""
});
test({
SUCCESS,
"integer",
R"""({
"type": "integer"
})""",
R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
space ::= " "?
)"""
});
test({
SUCCESS,
"string const",
R"""({
"const": "foo"
})""",
R"""(
root ::= "\"foo\""
space ::= " "?
)"""
});
test({
SUCCESS,
"non-string const",
R"""({
"const": 123
})""",
R"""(
root ::= "123"
space ::= " "?
)"""
});
test({
SUCCESS,
"non-string enum",
R"""({
"enum": ["red", "amber", "green", null, 42, ["foo"]]
})""",
R"""(
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
space ::= " "?
)"""
});
test({
SUCCESS,
"tuple1",
R"""({
"prefixItems": [{ "type": "string" }]
})""",
R"""(
root ::= "[" space string "]" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"tuple2",
R"""({
"prefixItems": [{ "type": "string" }, { "type": "number" }]
})""",
R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "[" space string "," space number "]" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"number",
R"""({
"type": "number"
})""",
R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
space ::= " "?
)"""
});
test({
SUCCESS,
"minItems",
R"""({
"items": {
"type": "boolean"
},
"minItems": 2
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space boolean ( "," space boolean )( "," space boolean )* "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"maxItems 1",
R"""({
"items": {
"type": "boolean"
},
"maxItems": 1
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space ( boolean )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"maxItems 2",
R"""({
"items": {
"type": "boolean"
},
"maxItems": 2
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space ( boolean ( "," space boolean )? )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"min + maxItems",
R"""({
"items": {
"type": ["number", "integer"]
},
"minItems": 3,
"maxItems": 5
})""",
R"""(
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
item ::= number | integer
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space
space ::= " "?
)"""
});
test({
SUCCESS,
"simple regexp",
R"""({
"type": "string",
"pattern": "^abc?d*efg+(hij)?kl$"
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp escapes",
R"""({
"type": "string",
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp quote",
R"""({
"type": "string",
"pattern": "^\"$"
})""",
R"""(
root ::= "\"" "\"" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp",
R"""({
"type": "string",
"pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} and...$"
})""",
R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
root-1 ::= [0-9]
space ::= " "?
)"""
});
test({
SUCCESS,
"required props in original order",
R"""({
"type": "object",
"properties": {
"b": {"type": "string"},
"c": {"type": "string"},
"a": {"type": "string"}
},
"required": [
"a",
"b",
"c"
],
"additionalProperties": false,
"definitions": {}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"1 optional prop",
R"""({
"properties": {
"a": {
"type": "string"
}
},
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
root ::= "{" space (a-kv )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"N optional props",
R"""({
"properties": {
"a": {"type": "string"},
"b": {"type": "string"},
"c": {"type": "string"}
},
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
a-rest ::= ( "," space b-kv )? b-rest
b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"required + optional props each in original order",
R"""({
"properties": {
"b": {"type": "string"},
"a": {"type": "string"},
"d": {"type": "string"},
"c": {"type": "string"}
},
"required": ["a", "b"],
"additionalProperties": false
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"additional props",
R"""({
"type": "object",
"additionalProperties": {"type": "array", "items": {"type": "number"}}
})""",
R"""(
additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space ( number ( "," space number )* )? "]" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space (additional-kvs )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"additional props (true)",
R"""({
"type": "object",
"additionalProperties": true
})""",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"additional props (implicit)",
R"""({
"type": "object"
})""",
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)"""
});
test({
SUCCESS,
"empty w/o additional props",
R"""({
"type": "object",
"additionalProperties": false
})""",
R"""(
root ::= "{" space "}" space
space ::= " "?
)"""
});
test({
SUCCESS,
"required + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"}
},
"required": ["a"],
"additionalProperties": {"type": "string"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"optional + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"}
},
"additionalProperties": {"type": "number"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
a-rest ::= additional-kvs
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"required + optional + additional props",
R"""({
"type": "object",
"properties": {
"a": {"type": "number"},
"b": {"type": "number"}
},
"required": ["a"],
"additionalProperties": {"type": "number"}
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"top-level $ref",
R"""({
"$ref": "#/definitions/MyType",
"definitions": {
"MyType": {
"type": "object",
"properties": {
"a": {
"type": "string"
}
},
"required": [
"a"
],
"additionalProperties": false
}
}
})""",
R"""(
MyType ::= "{" space MyType-a-kv "}" space
MyType-a-kv ::= "\"a\"" space ":" space string
root ::= MyType
space ::= " "?
string ::= "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)"""
});
test({
SUCCESS,
"anyOf",
R"""({
"anyOf": [
{"$ref": "#/definitions/foo"},
{"$ref": "#/definitions/bar"}
],
"definitions": {
"foo": {
"properties": {"a": {"type": "number"}}
},
"bar": {
"properties": {"b": {"type": "number"}}
}
},
"type": "object"
})""",
R"""(
alternative-0 ::= foo
alternative-1 ::= bar
bar ::= "{" space (bar-b-kv )? "}" space
bar-b-kv ::= "\"b\"" space ":" space number
foo ::= "{" space (foo-a-kv )? "}" space
foo-a-kv ::= "\"a\"" space ":" space number
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= alternative-0 | alternative-1
space ::= " "?
)"""
});
test({
SUCCESS,
"mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)",
R"""({
"allOf": [
{"$ref": "#/definitions/foo"},
{"$ref": "#/definitions/bar"},
{
"anyOf": [
{"$ref": "#/definitions/baz"},
{"$ref": "#/definitions/bam"}
]
}
],
"definitions": {
"foo": {
"properties": {"a": {"type": "number"}}
},
"bar": {
"properties": {"b": {"type": "number"}}
},
"bam": {
"properties": {"c": {"type": "number"}}
},
"baz": {
"properties": {"d": {"type": "number"}}
}
},
"type": "object"
})""",
R"""(
a-kv ::= "\"a\"" space ":" space number
b-kv ::= "\"b\"" space ":" space number
c-kv ::= "\"c\"" space ":" space number
d-kv ::= "\"d\"" space ":" space number
d-rest ::= ( "," space c-kv )?
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "?
)"""
});
test({
SUCCESS,
"conflicting names",
R"""({
"type": "object",
"properties": {
"number": {
"type": "object",
"properties": {
"number": {
"type": "object",
"properties": {
"root": {
"type": "number"
}
},
"required": [
"root"
],
"additionalProperties": false
}
},
"required": [
"number"
],
"additionalProperties": false
}
},
"required": [
"number"
],
"additionalProperties": false,
"definitions": {}
})""",
R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
number- ::= "{" space number-number-kv "}" space
number-kv ::= "\"number\"" space ":" space number-
number-number ::= "{" space number-number-root-kv "}" space
number-number-kv ::= "\"number\"" space ":" space number-number
number-number-root-kv ::= "\"root\"" space ":" space number
root ::= "{" space number-kv "}" space
space ::= " "?
)"""
});
}
int main() {
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
test_all("C++", [](const TestCase & tc) {
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
tc.verify_status(SUCCESS);
} catch (const std::runtime_error & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
}
test_all("Check Expectations Validity", [](const TestCase & tc) {
if (tc.expected_status == SUCCESS) {
tc.verify_expectation_parseable();
}
});
}
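The C++ runner above is a thin wrapper around json_schema_to_grammar; a minimal standalone sketch of the same conversion follows. The example schema is my own, and, as in the test, the json-schema-to-grammar.h header is assumed to pull in nlohmann::ordered_json.
// Sketch only: convert one JSON schema to a GBNF grammar and print it,
// mirroring the C++ branch of test_all() above.
#include <cstdio>
#include <string>
#include "json-schema-to-grammar.h"
int main() {
    const std::string schema = R"""({
        "type": "object",
        "properties": { "a": { "type": "string" } },
        "required": ["a"],
        "additionalProperties": false
    })""";
    const std::string grammar = json_schema_to_grammar(nlohmann::ordered_json::parse(schema));
    printf("%s\n", grammar.c_str());
    return 0;
}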