Merge commit '280345968d' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/main-cuda.Dockerfile
#	.devops/nix/package.nix
#	.devops/server-cuda.Dockerfile
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.lock
#	llama.cpp
#	scripts/LlamaConfig.cmake.in
#	scripts/compare-commits.sh
#	scripts/server-llm.sh
#	tests/test-quantize-fns.cpp
Commit: a530afa1e4
33 changed files with 124 additions and 1280 deletions
@@ -80,7 +80,7 @@ if (LLAMA_CUBLAS)
 
 enable_language(CUDA)
 
-add_compile_definitions(GGML_USE_CUBLAS)
+add_compile_definitions(GGML_USE_CUDA)
 add_compile_definitions(SD_USE_CUBLAS)
 add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
 
@@ -152,7 +152,7 @@ if (LLAMA_HIPBLAS)
 message(STATUS "HIP and hipBLAS found")
 file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
 list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS SD_USE_CUBLAS)
+add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
 add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
 if (LLAMA_CUDA_FORCE_DMMV)
 target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
Makefile: 6 changes

@@ -55,7 +55,7 @@ CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
 else
 CUBLAS_FLAGS =
 endif
@@ -142,7 +142,7 @@ endif
 
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 ifdef LLAMA_CUBLAS
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 NVCC = nvcc
@@ -226,7 +226,7 @@ ifdef LLAMA_HIPBLAS
 LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 1
 LLAMA_CUDA_KQUANTS_ITER ?= 2
-HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
@@ -49,12 +49,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
 #endif
 
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
 #endif
 
 #if defined(LLAMA_USE_CURL)
@@ -862,9 +862,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 return true;
 }
 params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--split-mode" || arg == "-sm") {
@@ -890,9 +890,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 invalid_param = true;
 return true;
 }
-#ifndef GGML_USE_CUBLAS_SYCL
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
 return true;
 }
 if (arg == "--tensor-split" || arg == "-ts") {
@@ -918,9 +918,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.tensor_split[i] = 0.0f;
 }
 }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
 return true;
 }
 if (arg == "--no-mmap") {
@@ -2388,7 +2388,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
 fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
 fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
 fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
 fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 
 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
 
 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
@@ -114,7 +114,7 @@ static std::string get_cpu_info() {
 
 static std::string get_gpu_info() {
 std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 int count = ggml_backend_cuda_get_device_count();
 for (int i = 0; i < count; i++) {
 char buf[128];
@@ -809,7 +809,7 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();
@@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 
 ### run on Orin
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
 
@@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 }
 }
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 new_clip->backend = ggml_backend_cuda_init(0);
 printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
 
 ### Considerations
 
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
 
 ### Build llama.cpp and install to C:\LlamaCPP directory
 
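As a rough sketch of the consideration described in the hunk above, a configure step for _main-cmake-pkg_ could pass both package locations through `CMAKE_PREFIX_PATH`; the install prefixes below are assumed placeholders, not values taken from this commit:

```bash
# Sketch only: the CLBlast and Llama package install paths are assumptions.
cmake -S . -B build \
  -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```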
@@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa
 
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
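For orientation, the GPU flags documented in the hunk above combine along these lines; the model path, layer count and split ratio are illustrative assumptions rather than values from the diff:

```bash
# Offload 35 layers, keep small tensors on GPU 0, split large tensors 60/40 across two GPUs.
# Model path and numbers are placeholders for the example.
./main -m models/7B/ggml-model-q4_0.gguf \
    -p "Building a website can be done in 10 simple steps:" \
    -ngl 35 -mg 0 -ts 3,2
```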
@@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
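A minimal server launch using the options listed in the hunk above might look as follows; the model file, alias, and network settings are assumptions for illustration:

```bash
# Assumed model path and settings; adjust to the model actually being served.
./server -m models/7B/ggml-model-q4_0.gguf \
    -a my-model -c 2048 -ngl 99 --host 0.0.0.0 --port 8080
```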
@@ -2511,15 +2511,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
-#ifndef GGML_USE_CUBLAS
-fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
 } else if (arg == "--tensor-split" || arg == "-ts") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 std::string arg_next = argv[i];
 
 // split string by , and /
@@ -2536,17 +2536,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }
 }
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
 } else if (arg == "--main-gpu" || arg == "-mg") {
 if (++i >= argc) {
 invalid_param = true;
 break;
 }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
 params.main_gpu = std::stoi(argv[i]);
 #else
-LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
 } else if (arg == "--lora") {
 if (++i >= argc) {
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
 // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
 ggml_backend_cuda_reg_devices();
 #endif
ggml.c: 8 changes

@@ -21719,15 +21719,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
 return 1;
 #else
 return 0;
 #endif
 }
 
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;
@@ -21767,7 +21767,7 @@ int ggml_cpu_has_sycl(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
 ggml_cpu_has_sycl();
 }
 
ggml.h: 2 changes

@@ -2361,7 +2361,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_fp16_va (void);
 GGML_API int ggml_cpu_has_wasm_simd (void);
 GGML_API int ggml_cpu_has_blas (void);
-GGML_API int ggml_cpu_has_cublas (void);
+GGML_API int ggml_cpu_has_cuda (void);
 GGML_API int ggml_cpu_has_clblast (void);
 GGML_API int ggml_cpu_has_vulkan (void);
 GGML_API int ggml_cpu_has_kompute (void);
@@ -829,7 +829,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info;
 
 printf("System Info: %s\n", llama_print_system_info());
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(file_format!=FileFormat::GGUF_GENERIC)
 {
 if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
@@ -909,7 +909,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.rope_freq_scale = rope_freq_scale;
 llama_ctx_params.n_batch = kcpp_params->n_batch;
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {
@@ -997,7 +997,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 }
 #endif
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
 {
 printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
@@ -1006,7 +1006,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 #endif
 model_params.main_gpu = cu_parseinfo_maindevice;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
 #else
 model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
@@ -1016,7 +1016,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_ctx_params.n_threads = kcpp_params->n_threads;
 llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
 bool ts_all_zero = true;
 for (int i = 0; i < tensor_split_max; ++i) {
 if (inputs.tensor_split[i] != 0.0f) {
llama.cpp: 34 changes

@@ -9,13 +9,11 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
-#endif
-#if defined(GGML_USE_VULKAN)
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+# include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
@@ -1533,7 +1531,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1563,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 
 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1589,7 +1587,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1610,7 +1608,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 
 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1622,7 +1620,7 @@ static size_t llama_get_device_count() {
 }
 
 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -2112,7 +2110,7 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
 }
@@ -5341,7 +5339,7 @@ static bool llm_load_tensors(
 }
 model.bufs.push_back(buf);
 bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_layer >= n_gpu_layers) {
 ggml_backend_cuda_register_host_buffer(
 ggml_backend_buffer_get_base(buf),
@@ -13698,8 +13696,8 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
 defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
 #else
@@ -13904,7 +13902,7 @@ struct llama_context * llama_new_context_with_model(
 }
 ctx->backends.push_back(ctx->backend_metal);
 }
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
 // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -14051,7 +14049,7 @@ struct llama_context * llama_new_context_with_model(
 
 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
 bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
 // pipeline parallelism requires support for async compute and events
 // currently this is only implemented in the CUDA backend
 pipeline_parallel = false;
@@ -136,7 +136,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v2-cuda.h"
 #include "ggml_v2-cuda-legacy.h"
 #endif
@@ -3895,7 +3895,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
 GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if(quants_unshuffled)
 {
 ggml_v2_init_cublas();
@@ -9456,7 +9456,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9656,7 +9656,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -9901,7 +9901,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 if(quants_unshuffled)
@@ -14087,7 +14087,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *
 
 size_t cur = 0;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
 node->n_tasks = 1; // TODO: this actually is doing nothing
 // the threads are still spinning
@@ -15585,7 +15585,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }
 
 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -15593,7 +15593,7 @@ int ggml_v2_cpu_has_blas(void) {
 }
 
 int ggml_v2_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;
@@ -1367,7 +1367,7 @@ inline static void * ggml_v3_aligned_malloc(size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
 #include "ggml_v3-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v3-opencl.h"
@@ -3413,7 +3413,7 @@ struct ggml_v3_context * ggml_v3_init(struct ggml_v3_init_params params) {
 GGML_V3_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 ggml_v3_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 ggml_v3_cl_init();
@@ -11325,7 +11325,7 @@ static void ggml_v3_compute_forward_out_prod_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
 
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_CLBLAST)
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11520,7 +11520,7 @@ static void ggml_v3_compute_forward_out_prod_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows
 
-// TODO: #if defined(GGML_USE_CUBLAS) ggml_v3_cuda_out_prod
+// TODO: #if defined(GGML_USE_CUDA) ggml_v3_cuda_out_prod
 // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
 if (params->type == GGML_V3_TASK_INIT) {
@@ -15587,14 +15587,14 @@ static void ggml_v3_compute_forward(struct ggml_v3_compute_params * params, stru
 return;
 }
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 bool skip_cpu = ggml_v3_cuda_compute_forward(params, tensor);
 if (skip_cpu) {
 return;
 }
 GGML_V3_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_V3_BACKEND_CPU);
 GGML_V3_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_V3_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 switch (tensor->op) {
 case GGML_V3_OP_DUP:
@@ -21106,7 +21106,7 @@ int ggml_v3_cpu_has_wasm_simd(void) {
 }
 
 int ggml_v3_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
@@ -21114,7 +21114,7 @@ int ggml_v3_cpu_has_blas(void) {
 }
 
 int ggml_v3_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 return 1;
 #else
 return 0;
@@ -16,7 +16,7 @@
 
 #include "model_adapter.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -353,7 +353,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.close();
 
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;
@@ -16,7 +16,7 @@
 
 #include "model_adapter.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -342,7 +342,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.close();
 
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;
@@ -504,7 +504,7 @@ struct llama_v3_buffer {
 llama_v3_buffer& operator=(llama_v3_buffer&&) = delete;
 };
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 struct llama_v3_ctx_buffer {
 uint8_t * addr = NULL;
@@ -407,7 +407,7 @@ struct llama_v2_buffer {
 llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
 uint8_t * addr = NULL;
@@ -10,7 +10,7 @@
 
 #include "ggml_v2.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1063,7 +1063,7 @@ static void llama_v2_model_load_internal(
 ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
 model.mapping = std::move(ml->mapping);
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 if(GetQuantsUnshuffled())
@@ -13,7 +13,7 @@
 
 #include "ggml_v3.h"
 #include "otherarch.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -61,7 +61,7 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
 
-#if !defined(GGML_USE_CUBLAS)
+#if !defined(GGML_USE_CUDA)
 #define LLAMA_V3_USE_ALLOCATOR
 #else
 #define LLAMA_V3_USE_SCRATCH
@@ -270,10 +270,10 @@ struct llama_v3_kv_cache {
 ggml_v3_free(ctx);
 }
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 ggml_v3_cuda_free_data(k);
 ggml_v3_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 }
 };
 
@@ -329,7 +329,7 @@ struct llama_v3_model {
 ggml_v3_free(ctx);
 }
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 ggml_v3_cuda_free_data(tensors_by_name[i].second);
 }
@@ -795,7 +795,7 @@ struct llama_v3_model_loader {
 lmlock->grow_to(lock_size);
 }
 break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 case GGML_V3_BACKEND_GPU:
 case GGML_V3_BACKEND_GPU_SPLIT:
 ggml_v3_cuda_transform_tensor(lt.data, lt.ggml_v3_tensor);
@@ -882,14 +882,14 @@ static bool kv_cache_init(
 ggml_v3_set_name(cache.v, "cache_v");
 
 (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer + 1) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.v);
 }
 if (n_gpu_layers > n_layer + 2) {
 ggml_v3_cuda_assign_buffers_no_scratch(cache.k);
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 return true;
 }
@@ -1181,7 +1181,7 @@ static void llama_v3_model_load_internal(
 
 (void) main_gpu;
 (void) mul_mat_q;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 LLAMA_V3_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
 ggml_v3_cuda_set_main_device(main_gpu);
 ggml_v3_cuda_set_mul_mat_q(mul_mat_q);
@@ -1298,7 +1298,7 @@ static void llama_v3_model_load_internal(
 
 (void) vram_scratch;
 (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (low_vram) {
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_v3_cuda_set_scratch_size(0); // disable scratch
@@ -1313,9 +1313,9 @@ static void llama_v3_model_load_internal(
 (vram_scratch + MB3 - 1) / MB3); // round up
 }
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
 LLAMA_V3_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1324,7 +1324,7 @@ static void llama_v3_model_load_internal(
 }
 size_t vram_kv_cache = 0;
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 const int max_backend_supported_layers = hparams.n_layer + 3;
 const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
@@ -1346,7 +1346,7 @@ static void llama_v3_model_load_internal(
 #elif defined(GGML_USE_CLBLAST)
 const int max_backend_supported_layers = hparams.n_layer + 1;
 const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 LLAMA_V3_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
@@ -1354,7 +1354,7 @@ static void llama_v3_model_load_internal(
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB3 - 1) / MB3); // round up
 #else
 (void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#endif // defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 }
 
 // populate `tensors_by_name`
@@ -1363,7 +1363,7 @@ static void llama_v3_model_load_internal(
 }
 
 (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 {
 ggml_v3_cuda_set_tensor_split(tensor_split);
 }
@@ -1510,7 +1510,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 offload_func_v3_t offload_func_kq = llama_v3_nop;
 offload_func_v3_t offload_func_v = llama_v3_nop;
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (n_gpu_layers > n_layer) {
 offload_func_nr = ggml_v3_cuda_assign_buffers;
 }
@@ -1520,7 +1520,7 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 if (n_gpu_layers > n_layer + 2) {
 offload_func_kq = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 struct ggml_v3_tensor * KQ_scale = ggml_v3_new_tensor_1d(ctx0, GGML_V3_TYPE_F32, 1);
 #ifdef LLAMA_V3_USE_ALLOCATOR
@@ -1541,11 +1541,11 @@ static struct ggml_v3_cgraph * llama_v3_build_graph(
 
 offload_func_v3_t offload_func = llama_v3_nop;
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 if (il >= i_gpu_start) {
 offload_func = ggml_v3_cuda_assign_buffers;
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 struct ggml_v3_tensor * inpSA = inpL;
 
@@ -3661,19 +3661,19 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 offload_func_v3_t offload_func = llama_v3_nop;
 offload_func_v3_t offload_func_force_inplace = llama_v3_nop;
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST)
 if (dest_t->backend == GGML_V3_BACKEND_GPU || dest_t->backend == GGML_V3_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_V3_TYPE_F16) {
 printf("\nError: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models\n");
 throw std::runtime_error(format_old(
 "%s: error: lora failed", __func__));
 }
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
 offload_func = ggml_v3_cuda_assign_buffers;
 offload_func_force_inplace = ggml_v3_cuda_assign_buffers_force_inplace;
 #endif
 }
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 
 ggml_v3_tensor * base_t;
 if (model_loader) {
@@ -2,12 +2,12 @@
 #define LLAMA_V3_H
 
 #include "ggml_v3.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #define LLAMA_V3_MAX_DEVICES GGML_V3_CUDA_MAX_DEVICES
 #else
 #define LLAMA_V3_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -48,7 +48,7 @@
 
 #define LLAMA_V3_DEFAULT_SEED 0xFFFFFFFF
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_V3_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -16,7 +16,7 @@
 
 #include "model_adapter.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -295,7 +295,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 fin.close();
 
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;
@@ -14,7 +14,7 @@
 #include <iostream>
 #include <algorithm>
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -329,7 +329,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
 fin.close();
 
 //gpu offload
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 if(gpulayers>0)
 {
 const auto & hparams = model.hparams;
@@ -6,7 +6,7 @@
 #include "rwkv_v3.h"
 #include "ggml_v3.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml_v3-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -1076,7 +1076,7 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
 const size_t n_threads,
 const size_t sequence_len = 1
 ) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 enum ggml_v3_type mul_mat_type = type == GGML_V3_TYPE_F32 ? GGML_V3_TYPE_F32 : GGML_V3_TYPE_F16;
 #else
 enum ggml_v3_type mul_mat_type = ggml_v3_is_quantized(type) ? GGML_V3_TYPE_Q8_1 : type;
@@ -1566,7 +1566,7 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }
 
 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUDA)
 printf("\nOffloading %u (or fewer) layers...",n_layers);
 const auto offload = [&](struct ggml_v3_tensor * tensor) {
 // TODO support multi-GPU
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
-
-echo "Usage:"
-echo ""
-echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
-echo ""
-
-exit 0
@@ -1,213 +0,0 @@
-#!/bin/bash
-#
-# Use this script only on fresh pods (runpod.io)!
-# Otherwise, it can break your environment!
-#
-
-if [ -z "$1" ]; then
-echo "Usage: $0 <data>"
-echo " 0: no models"
-echo " 1: tinyllama-1b"
-echo " 2: codellama-7b"
-echo " 3: codellama-13b"
-echo " 4: codellama-34b"
-echo " 5: codellama-7b-instruct"
-echo " 6: codellama-13b-instruct"
-echo " 7: codellama-34b-instruct"
-
-exit 1
-fi
-
-set -x
-
-# setup deps
-apt-get update
-apt-get install -y git-lfs cmake cmake-curses-gui vim ruby
-git-lfs install
-
-if [ ! -d "/workspace" ]; then
-ln -sfn $(pwd) /workspace
-fi
-
-# download data
-cd /workspace
-
-# this is useful to git clone repos without doubling the disk size due to .git
-git clone https://github.com/iboB/git-lfs-download
-ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download
-
-# llama.cpp
-cd /workspace
-git clone https://github.com/ggerganov/llama.cpp
-
-cd llama.cpp
-
-LLAMA_CUBLAS=1 make -j
-
-ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
-ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
-ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b
-ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b
-ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct
-ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct
-ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct
-
-pip install -r requirements.txt
-
-# cmake
-cd /workspace/llama.cpp
-
-mkdir build-cublas
-cd build-cublas
-
-cmake -DLLAMA_CUBLAS=1 ../
-make -j
-
-if [ "$1" -eq "0" ]; then
-exit 0
-fi
-
-# more models
-if [ "$1" -eq "1" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
-./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
-./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "2" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors*
-rm -v ./CodeLlama-7b-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "3" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors*
-rm -v ./CodeLlama-13b-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "4" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors*
-rm -v ./CodeLlama-34b-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "5" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors*
-rm -v ./CodeLlama-7b-Instruct-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "6" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors*
-rm -v ./CodeLlama-13b-Instruct-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "7" ]; then
-cd /workspace
-
-git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors*
-rm -v ./CodeLlama-34b-Instruct-hf/*safetensors*
-
-cd /workspace/llama.cpp
-
-python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
-
-./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
-./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
-./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0
-fi
-
-if [ "$1" -eq "1" ]; then
-# perf + perplexity
-cd /workspace/llama.cpp/build-cublas
-
-make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128"
-
-../scripts/get-wikitext-2.sh
-unzip wikitext-2-raw-v1.zip
-
-make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
-
-# batched
-cd /workspace/llama.cpp
-
-LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
-
-# batched-bench
-cd /workspace/llama.cpp
-
-LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
-
-# parallel
-cd /workspace/llama.cpp
-
-LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
-
-fi
-
-# speculative
-#if [ "$1" -eq "7" ]; then
-# cd /workspace/llama.cpp
-#
-# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
-#fi
-
-# more benches
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
@@ -1,10 +0,0 @@
-import { readFileSync } from "fs"
-import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
-
-const [, , file] = process.argv
-const url = `file://${file}`
-let schema = JSON.parse(readFileSync(file, "utf8"));
-const converter = new SchemaConverter({})
-schema = await converter.resolveRefs(schema, url)
-converter.visit(schema, '')
-console.log(converter.formatGrammar())
@ -1,79 +0,0 @@
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#undef NDEBUG
|
|
||||||
#include <cassert>
|
|
||||||
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
int main(void) {
|
|
||||||
llama_chat_message conversation[] = {
|
|
||||||
{"system", "You are a helpful assistant"},
|
|
||||||
{"user", "Hello"},
|
|
||||||
{"assistant", "Hi there"},
|
|
||||||
{"user", "Who are you"},
|
|
||||||
{"assistant", " I am an assistant "},
|
|
||||||
{"user", "Another question"},
|
|
||||||
};
|
|
||||||
size_t message_count = 6;
|
|
||||||
std::vector<std::string> templates = {
|
|
||||||
// teknium/OpenHermes-2.5-Mistral-7B
|
|
||||||
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
|
|
||||||
// mistralai/Mistral-7B-Instruct-v0.2
|
|
||||||
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
|
|
||||||
// TheBloke/FusionNet_34Bx2_MoE-AWQ
|
|
||||||
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
|
|
||||||
// bofenghuang/vigogne-2-70b-chat
|
|
||||||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
|
||||||
// mlabonne/AlphaMonarch-7B
|
|
||||||
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
|
|
||||||
// google/gemma-7b-it
|
|
||||||
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
|
|
||||||
// OrionStarAI/Orion-14B-Chat
|
|
||||||
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
|
|
||||||
};
|
|
||||||
std::vector<std::string> expected_output = {
|
|
||||||
// teknium/OpenHermes-2.5-Mistral-7B
|
|
||||||
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
|
|
||||||
// mistralai/Mistral-7B-Instruct-v0.2
|
|
||||||
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
|
||||||
// TheBloke/FusionNet_34Bx2_MoE-AWQ
|
|
||||||
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
|
|
||||||
// bofenghuang/vigogne-2-70b-chat
|
|
||||||
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
|
||||||
// mlabonne/AlphaMonarch-7B
|
|
||||||
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
|
|
||||||
// google/gemma-7b-it
|
|
||||||
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
|
|
||||||
// OrionStarAI/Orion-14B-Chat
|
|
||||||
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
|
|
||||||
};
|
|
||||||
std::vector<char> formatted_chat(1024);
|
|
||||||
int32_t res;
|
|
||||||
|
|
||||||
// test invalid chat template
|
|
||||||
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
|
|
||||||
assert(res < 0);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < templates.size(); i++) {
|
|
||||||
std::string custom_template = templates[i];
|
|
||||||
std::string expected = expected_output[i];
|
|
||||||
formatted_chat.resize(1024);
|
|
||||||
res = llama_chat_apply_template(
|
|
||||||
nullptr,
|
|
||||||
custom_template.c_str(),
|
|
||||||
conversation,
|
|
||||||
message_count,
|
|
||||||
true,
|
|
||||||
formatted_chat.data(),
|
|
||||||
formatted_chat.size()
|
|
||||||
);
|
|
||||||
formatted_chat.resize(res);
|
|
||||||
std::string output(formatted_chat.data(), formatted_chat.size());
|
|
||||||
std::cout << output << "\n-------------------------\n";
|
|
||||||
assert(output == expected);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,842 +0,0 @@
|
||||||
#ifdef NDEBUG
|
|
||||||
#undef NDEBUG
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <fstream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <regex>
|
|
||||||
|
|
||||||
#include "json-schema-to-grammar.h"
|
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
static std::string trim(const std::string & source) {
|
|
||||||
std::string s(source);
|
|
||||||
s.erase(0,s.find_first_not_of(" \n\r\t"));
|
|
||||||
s.erase(s.find_last_not_of(" \n\r\t")+1);
|
|
||||||
return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1");
|
|
||||||
}
|
|
||||||
|
|
||||||
enum TestCaseStatus {
|
|
||||||
SUCCESS,
|
|
||||||
FAILURE
|
|
||||||
};
|
|
||||||
|
|
||||||
struct TestCase {
|
|
||||||
TestCaseStatus expected_status;
|
|
||||||
std::string name;
|
|
||||||
std::string schema;
|
|
||||||
std::string expected_grammar;
|
|
||||||
|
|
||||||
void _print_failure_header() const {
|
|
||||||
fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str());
|
|
||||||
}
|
|
||||||
void verify(const std::string & actual_grammar) const {
|
|
||||||
if (trim(actual_grammar) != trim(expected_grammar)) {
|
|
||||||
_print_failure_header();
|
|
||||||
fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str());
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void verify_expectation_parseable() const {
|
|
||||||
try {
|
|
||||||
auto state = grammar_parser::parse(expected_grammar.c_str());
|
|
||||||
if (state.symbol_ids.find("root") == state.symbol_ids.end()) {
|
|
||||||
throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar);
|
|
||||||
}
|
|
||||||
} catch (const std::runtime_error & ex) {
|
|
||||||
_print_failure_header();
|
|
||||||
fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what());
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void verify_status(TestCaseStatus status) const {
|
|
||||||
if (status != expected_status) {
|
|
||||||
_print_failure_header();
|
|
||||||
fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE");
|
|
||||||
fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? "SUCCESS" : "FAILURE");
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static void write(const std::string & file, const std::string & content) {
|
|
||||||
std::ofstream f;
|
|
||||||
f.open(file.c_str());
|
|
||||||
f << content.c_str();
|
|
||||||
f.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string read(const std::string & file) {
|
|
||||||
std::ostringstream actuals;
|
|
||||||
actuals << std::ifstream(file.c_str()).rdbuf();
|
|
||||||
return actuals.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
|
|
||||||
fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str());
|
|
||||||
auto test = [&](const TestCase & tc) {
|
|
||||||
fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? " (failure expected)" : "");
|
|
||||||
runner(tc);
|
|
||||||
};
|
|
||||||
|
|
||||||
test({
|
|
||||||
FAILURE,
|
|
||||||
"unknown type",
|
|
||||||
R"""({
|
|
||||||
"type": "kaboom"
|
|
||||||
})""",
|
|
||||||
""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
FAILURE,
|
|
||||||
"invalid type",
|
|
||||||
R"""({
|
|
||||||
"type": 123
|
|
||||||
})""",
|
|
||||||
""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"empty schema (object)",
|
|
||||||
"{}",
|
|
||||||
R"""(
|
|
||||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
null ::= "null" space
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
|
||||||
root ::= object
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
value ::= object | array | string | number | boolean
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"exotic formats",
|
|
||||||
R"""({
|
|
||||||
"items": [
|
|
||||||
{ "format": "date" },
|
|
||||||
{ "format": "uuid" },
|
|
||||||
{ "format": "time" },
|
|
||||||
{ "format": "date-time" }
|
|
||||||
]
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
date ::= [0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
|
|
||||||
date-string ::= "\"" date "\"" space
|
|
||||||
date-time ::= date "T" time
|
|
||||||
date-time-string ::= "\"" date-time "\"" space
|
|
||||||
root ::= "[" space date-string "," space uuid "," space time-string "," space date-time-string "]" space
|
|
||||||
space ::= " "?
|
|
||||||
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
|
|
||||||
time-string ::= "\"" time "\"" space
|
|
||||||
uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"string",
|
|
||||||
R"""({
|
|
||||||
"type": "string"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"boolean",
|
|
||||||
R"""({
|
|
||||||
"type": "boolean"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= ("true" | "false") space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"integer",
|
|
||||||
R"""({
|
|
||||||
"type": "integer"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"string const",
|
|
||||||
R"""({
|
|
||||||
"const": "foo"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"foo\""
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"non-string const",
|
|
||||||
R"""({
|
|
||||||
"const": 123
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "123"
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"non-string enum",
|
|
||||||
R"""({
|
|
||||||
"enum": ["red", "amber", "green", null, 42, ["foo"]]
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"tuple1",
|
|
||||||
R"""({
|
|
||||||
"prefixItems": [{ "type": "string" }]
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "[" space string "]" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"tuple2",
|
|
||||||
R"""({
|
|
||||||
"prefixItems": [{ "type": "string" }, { "type": "number" }]
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "[" space string "," space number "]" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"number",
|
|
||||||
R"""({
|
|
||||||
"type": "number"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"minItems",
|
|
||||||
R"""({
|
|
||||||
"items": {
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"minItems": 2
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
root ::= "[" space boolean ( "," space boolean )( "," space boolean )* "]" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"maxItems 1",
|
|
||||||
R"""({
|
|
||||||
"items": {
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"maxItems": 1
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
root ::= "[" space ( boolean )? "]" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"maxItems 2",
|
|
||||||
R"""({
|
|
||||||
"items": {
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"maxItems": 2
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
root ::= "[" space ( boolean ( "," space boolean )? )? "]" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"min + maxItems",
|
|
||||||
R"""({
|
|
||||||
"items": {
|
|
||||||
"type": ["number", "integer"]
|
|
||||||
},
|
|
||||||
"minItems": 3,
|
|
||||||
"maxItems": 5
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
|
|
||||||
item ::= number | integer
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"simple regexp",
|
|
||||||
R"""({
|
|
||||||
"type": "string",
|
|
||||||
"pattern": "^abc?d*efg+(hij)?kl$"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"regexp escapes",
|
|
||||||
R"""({
|
|
||||||
"type": "string",
|
|
||||||
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"" "[]{}()|+*?" "\"" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"regexp quote",
|
|
||||||
R"""({
|
|
||||||
"type": "string",
|
|
||||||
"pattern": "^\"$"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "\"" "\"" "\"" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"regexp",
|
|
||||||
R"""({
|
|
||||||
"type": "string",
|
|
||||||
"pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} and...$"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
|
|
||||||
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
|
|
||||||
root-1 ::= [0-9]
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"required props in original order",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"b": {"type": "string"},
|
|
||||||
"c": {"type": "string"},
|
|
||||||
"a": {"type": "string"}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"a",
|
|
||||||
"b",
|
|
||||||
"c"
|
|
||||||
],
|
|
||||||
"additionalProperties": false,
|
|
||||||
"definitions": {}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space string
|
|
||||||
b-kv ::= "\"b\"" space ":" space string
|
|
||||||
c-kv ::= "\"c\"" space ":" space string
|
|
||||||
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"1 optional prop",
|
|
||||||
R"""({
|
|
||||||
"properties": {
|
|
||||||
"a": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space string
|
|
||||||
root ::= "{" space (a-kv )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"N optional props",
|
|
||||||
R"""({
|
|
||||||
"properties": {
|
|
||||||
"a": {"type": "string"},
|
|
||||||
"b": {"type": "string"},
|
|
||||||
"c": {"type": "string"}
|
|
||||||
},
|
|
||||||
"additionalProperties": false
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space string
|
|
||||||
a-rest ::= ( "," space b-kv )? b-rest
|
|
||||||
b-kv ::= "\"b\"" space ":" space string
|
|
||||||
b-rest ::= ( "," space c-kv )?
|
|
||||||
c-kv ::= "\"c\"" space ":" space string
|
|
||||||
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"required + optional props each in original order",
|
|
||||||
R"""({
|
|
||||||
"properties": {
|
|
||||||
"b": {"type": "string"},
|
|
||||||
"a": {"type": "string"},
|
|
||||||
"d": {"type": "string"},
|
|
||||||
"c": {"type": "string"}
|
|
||||||
},
|
|
||||||
"required": ["a", "b"],
|
|
||||||
"additionalProperties": false
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space string
|
|
||||||
b-kv ::= "\"b\"" space ":" space string
|
|
||||||
c-kv ::= "\"c\"" space ":" space string
|
|
||||||
d-kv ::= "\"d\"" space ":" space string
|
|
||||||
d-rest ::= ( "," space c-kv )?
|
|
||||||
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"additional props",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {"type": "array", "items": {"type": "number"}}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
additional-kv ::= string ":" space additional-value
|
|
||||||
additional-kvs ::= additional-kv ( "," space additional-kv )*
|
|
||||||
additional-value ::= "[" space ( number ( "," space number )* )? "]" space
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "{" space (additional-kvs )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"additional props (true)",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": true
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
null ::= "null" space
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
|
||||||
root ::= object
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
value ::= object | array | string | number | boolean
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"additional props (implicit)",
|
|
||||||
R"""({
|
|
||||||
"type": "object"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
|
||||||
boolean ::= ("true" | "false") space
|
|
||||||
null ::= "null" space
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
|
||||||
root ::= object
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
value ::= object | array | string | number | boolean
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"empty w/o additional props",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": false
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
root ::= "{" space "}" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"required + additional props",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"a": {"type": "number"}
|
|
||||||
},
|
|
||||||
"required": ["a"],
|
|
||||||
"additionalProperties": {"type": "string"}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space number
|
|
||||||
additional-kv ::= string ":" space string
|
|
||||||
additional-kvs ::= additional-kv ( "," space additional-kv )*
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"optional + additional props",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"a": {"type": "number"}
|
|
||||||
},
|
|
||||||
"additionalProperties": {"type": "number"}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space number
|
|
||||||
a-rest ::= additional-kvs
|
|
||||||
additional-kv ::= string ":" space number
|
|
||||||
additional-kvs ::= additional-kv ( "," space additional-kv )*
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"required + optional + additional props",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"a": {"type": "number"},
|
|
||||||
"b": {"type": "number"}
|
|
||||||
},
|
|
||||||
"required": ["a"],
|
|
||||||
"additionalProperties": {"type": "number"}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space number
|
|
||||||
additional-kv ::= string ":" space number
|
|
||||||
additional-kvs ::= additional-kv ( "," space additional-kv )*
|
|
||||||
b-kv ::= "\"b\"" space ":" space number
|
|
||||||
b-rest ::= additional-kvs
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"top-level $ref",
|
|
||||||
R"""({
|
|
||||||
"$ref": "#/definitions/MyType",
|
|
||||||
"definitions": {
|
|
||||||
"MyType": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"a": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"a"
|
|
||||||
],
|
|
||||||
"additionalProperties": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
MyType ::= "{" space MyType-a-kv "}" space
|
|
||||||
MyType-a-kv ::= "\"a\"" space ":" space string
|
|
||||||
root ::= MyType
|
|
||||||
space ::= " "?
|
|
||||||
string ::= "\"" (
|
|
||||||
[^"\\] |
|
|
||||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
|
||||||
)* "\"" space
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"anyOf",
|
|
||||||
R"""({
|
|
||||||
"anyOf": [
|
|
||||||
{"$ref": "#/definitions/foo"},
|
|
||||||
{"$ref": "#/definitions/bar"}
|
|
||||||
],
|
|
||||||
"definitions": {
|
|
||||||
"foo": {
|
|
||||||
"properties": {"a": {"type": "number"}}
|
|
||||||
},
|
|
||||||
"bar": {
|
|
||||||
"properties": {"b": {"type": "number"}}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"type": "object"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
alternative-0 ::= foo
|
|
||||||
alternative-1 ::= bar
|
|
||||||
bar ::= "{" space (bar-b-kv )? "}" space
|
|
||||||
bar-b-kv ::= "\"b\"" space ":" space number
|
|
||||||
foo ::= "{" space (foo-a-kv )? "}" space
|
|
||||||
foo-a-kv ::= "\"a\"" space ":" space number
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= alternative-0 | alternative-1
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)",
|
|
||||||
R"""({
|
|
||||||
"allOf": [
|
|
||||||
{"$ref": "#/definitions/foo"},
|
|
||||||
{"$ref": "#/definitions/bar"},
|
|
||||||
{
|
|
||||||
"anyOf": [
|
|
||||||
{"$ref": "#/definitions/baz"},
|
|
||||||
{"$ref": "#/definitions/bam"}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"definitions": {
|
|
||||||
"foo": {
|
|
||||||
"properties": {"a": {"type": "number"}}
|
|
||||||
},
|
|
||||||
"bar": {
|
|
||||||
"properties": {"b": {"type": "number"}}
|
|
||||||
},
|
|
||||||
"bam": {
|
|
||||||
"properties": {"c": {"type": "number"}}
|
|
||||||
},
|
|
||||||
"baz": {
|
|
||||||
"properties": {"d": {"type": "number"}}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"type": "object"
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
a-kv ::= "\"a\"" space ":" space number
|
|
||||||
b-kv ::= "\"b\"" space ":" space number
|
|
||||||
c-kv ::= "\"c\"" space ":" space number
|
|
||||||
d-kv ::= "\"d\"" space ":" space number
|
|
||||||
d-rest ::= ( "," space c-kv )?
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
|
|
||||||
test({
|
|
||||||
SUCCESS,
|
|
||||||
"conflicting names",
|
|
||||||
R"""({
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"number": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"number": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"root": {
|
|
||||||
"type": "number"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"root"
|
|
||||||
],
|
|
||||||
"additionalProperties": false
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"number"
|
|
||||||
],
|
|
||||||
"additionalProperties": false
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"number"
|
|
||||||
],
|
|
||||||
"additionalProperties": false,
|
|
||||||
"definitions": {}
|
|
||||||
})""",
|
|
||||||
R"""(
|
|
||||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
|
|
||||||
number- ::= "{" space number-number-kv "}" space
|
|
||||||
number-kv ::= "\"number\"" space ":" space number-
|
|
||||||
number-number ::= "{" space number-number-root-kv "}" space
|
|
||||||
number-number-kv ::= "\"number\"" space ":" space number-number
|
|
||||||
number-number-root-kv ::= "\"root\"" space ":" space number
|
|
||||||
root ::= "{" space number-kv "}" space
|
|
||||||
space ::= " "?
|
|
||||||
)"""
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
|
||||||
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
|
||||||
|
|
||||||
test_all("C++", [](const TestCase & tc) {
|
|
||||||
try {
|
|
||||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
|
|
||||||
tc.verify_status(SUCCESS);
|
|
||||||
} catch (const std::runtime_error & ex) {
|
|
||||||
fprintf(stderr, "Error: %s\n", ex.what());
|
|
||||||
tc.verify_status(FAILURE);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
|
|
||||||
test_all("Python", [](const TestCase & tc) {
|
|
||||||
write("test-json-schema-input.tmp", tc.schema);
|
|
||||||
tc.verify_status(std::system(
|
|
||||||
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
|
||||||
tc.verify(read("test-grammar-output.tmp"));
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
|
|
||||||
test_all("JavaScript", [](const TestCase & tc) {
|
|
||||||
write("test-json-schema-input.tmp", tc.schema);
|
|
||||||
tc.verify_status(std::system(
|
|
||||||
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
|
||||||
tc.verify(read("test-grammar-output.tmp"));
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
|
|
||||||
}
|
|
||||||
|
|
||||||
test_all("Check Expectations Validity", [](const TestCase & tc) {
|
|
||||||
if (tc.expected_status == SUCCESS) {
|
|
||||||
tc.verify_expectation_parseable();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|