mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
attempts a backflip, but does he stick the landing?
This commit is contained in:
commit
70aee82552
139 changed files with 16067 additions and 19275 deletions
|
@ -1,30 +0,0 @@
|
||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG MUSA_VERSION=rc3.1.0
|
|
||||||
# Target the MUSA build image
|
|
||||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the MUSA runtime image
|
|
||||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-cli -j$(nproc)
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libgomp1
|
|
||||||
|
|
||||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
|
||||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
|
|
@ -1,35 +0,0 @@
|
||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
# This needs to generally match the container host's environment.
|
|
||||||
ARG MUSA_VERSION=rc3.1.0
|
|
||||||
# Target the MUSA build image
|
|
||||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
|
||||||
# Target the MUSA runtime image
|
|
||||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
|
||||||
cmake --build build --config Release --target llama-server -j$(nproc)
|
|
||||||
|
|
||||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
|
||||||
|
|
||||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
|
||||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
|
||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
|
||||||
|
|
||||||
# Must be set to 0.0.0.0 so the server can listen for requests from the host machine
|
|
||||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
|
0
.gitmodules
vendored
0
.gitmodules
vendored
|
@ -174,7 +174,7 @@ if (LLAMA_HIPBLAS)
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
|
file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS)
|
add_compile_definitions(GGML_USE_HIP GGML_USE_CUDA SD_USE_CUBLAS)
|
||||||
add_library(ggml-rocm ${GGML_SOURCES_CUDA})
|
add_library(ggml-rocm ${GGML_SOURCES_CUDA})
|
||||||
if (LLAMA_CUDA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
|
target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
|
||||||
|
@ -421,7 +421,7 @@ endif()
|
||||||
add_library(ggml
|
add_library(ggml
|
||||||
ggml/src/ggml.c
|
ggml/src/ggml.c
|
||||||
ggml/include/ggml.h
|
ggml/include/ggml.h
|
||||||
ggml/src/ggml-cpu.c
|
ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
ggml/include/ggml-cpu.h
|
ggml/include/ggml-cpu.h
|
||||||
ggml/src/ggml-alloc.c
|
ggml/src/ggml-alloc.c
|
||||||
ggml/include/ggml-alloc.h
|
ggml/include/ggml-alloc.h
|
||||||
|
@ -431,10 +431,17 @@ add_library(ggml
|
||||||
ggml/include/ggml-cpp.h
|
ggml/include/ggml-cpp.h
|
||||||
ggml/src/ggml-quants.c
|
ggml/src/ggml-quants.c
|
||||||
ggml/src/ggml-quants.h
|
ggml/src/ggml-quants.h
|
||||||
ggml/src/llamafile/sgemm.cpp
|
ggml/src/ggml-cpu/llamafile/sgemm.cpp
|
||||||
ggml/src/llamafile/sgemm.h
|
ggml/src/ggml-cpu/llamafile/sgemm.h
|
||||||
ggml/src/ggml-aarch64.c
|
ggml/src/ggml-aarch64.c
|
||||||
ggml/src/ggml-aarch64.h
|
ggml/src/ggml-aarch64.h
|
||||||
|
ggml/src/ggml-threading.cpp
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu.cpp
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu-aarch64.h
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu-quants.c
|
||||||
|
ggml/src/ggml-cpu/ggml-cpu-quants.h
|
||||||
|
ggml/src/ggml-backend-reg.cpp
|
||||||
${GGML_SOURCES_CUDA})
|
${GGML_SOURCES_CUDA})
|
||||||
target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
|
target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
|
||||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||||
|
|
110
Makefile
110
Makefile
|
@ -48,8 +48,8 @@ ifdef KCPP_DEBUG
|
||||||
CFLAGS = -g -O0
|
CFLAGS = -g -O0
|
||||||
CXXFLAGS = -g -O0
|
CXXFLAGS = -g -O0
|
||||||
endif
|
endif
|
||||||
CFLAGS += -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
ifndef KCPP_DEBUG
|
ifndef KCPP_DEBUG
|
||||||
CFLAGS += -DNDEBUG -s
|
CFLAGS += -DNDEBUG -s
|
||||||
CXXFLAGS += -DNDEBUG -s
|
CXXFLAGS += -DNDEBUG -s
|
||||||
|
@ -86,9 +86,9 @@ endif
|
||||||
CUBLASLD_FLAGS =
|
CUBLASLD_FLAGS =
|
||||||
CUBLAS_OBJS =
|
CUBLAS_OBJS =
|
||||||
|
|
||||||
OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o
|
OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm.o common.o sampling.o
|
||||||
OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o
|
OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_noavx2.o common.o sampling.o
|
||||||
OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o
|
OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_failsafe.o common.o sampling.o
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
# TODO: support Windows
|
# TODO: support Windows
|
||||||
|
@ -167,8 +167,8 @@ ifndef LLAMA_NO_ACCELERATE
|
||||||
# Mac M1 - include Accelerate framework.
|
# Mac M1 - include Accelerate framework.
|
||||||
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
|
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
CFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
|
CFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
|
||||||
CXXFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
|
CXXFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
|
||||||
LDFLAGS += -framework Accelerate
|
LDFLAGS += -framework Accelerate
|
||||||
OBJS += ggml-blas.o
|
OBJS += ggml-blas.o
|
||||||
endif
|
endif
|
||||||
|
@ -241,7 +241,7 @@ endif
|
||||||
|
|
||||||
ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh
|
ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh
|
||||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
ggml-cuda.o: ggml/src/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh)
|
ggml-cuda.o: ggml/src/ggml-cuda/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh)
|
||||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
||||||
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(HIPFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
|
@ -266,7 +266,7 @@ ifdef LLAMA_HIPBLAS
|
||||||
LLAMA_CUDA_DMMV_X ?= 32
|
LLAMA_CUDA_DMMV_X ?= 32
|
||||||
LLAMA_CUDA_MMV_Y ?= 1
|
LLAMA_CUDA_MMV_Y ?= 1
|
||||||
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||||
HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
|
HIPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
|
||||||
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||||
HIPLDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
|
HIPLDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
|
||||||
HIPLDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
HIPLDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||||
|
@ -281,7 +281,7 @@ ifdef LLAMA_HIPBLAS
|
||||||
|
|
||||||
ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh
|
ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh
|
||||||
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
||||||
ggml-cuda.o: ggml/src/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh)
|
ggml-cuda.o: ggml/src/ggml-cuda/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh)
|
||||||
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
||||||
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
||||||
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
$(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $<
|
||||||
|
@ -298,9 +298,9 @@ ifdef LLAMA_METAL
|
||||||
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||||
OBJS += ggml-metal.o
|
OBJS += ggml-metal.o
|
||||||
|
|
||||||
ggml-metal.o: ggml/src/ggml-metal.m ggml/include/ggml-metal.h
|
ggml-metal.o: ggml/src/ggml-metal/ggml-metal.m ggml/include/ggml-metal.h
|
||||||
@echo "== Preparing merged Metal file =="
|
@echo "== Preparing merged Metal file =="
|
||||||
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-merged.metal
|
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal-merged.metal
|
||||||
@cp ggml/src/ggml-metal-merged.metal ./ggml-metal-merged.metal
|
@cp ggml/src/ggml-metal-merged.metal ./ggml-metal-merged.metal
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
endif # LLAMA_METAL
|
endif # LLAMA_METAL
|
||||||
|
@ -431,15 +431,15 @@ ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
|
||||||
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
# cpu and clblast separated
|
# cpu and clblast separated
|
||||||
ggml-cpu.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
ggml-cpu.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
ggml-cpu_v4_failsafe.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
ggml-cpu_v4_failsafe.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
ggml-cpu_v4_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
ggml-cpu_v4_noavx2.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
ggml-cpu_v4_clblast.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
ggml-cpu_v4_clblast.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
ggml-cpu_v4_clblast_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
ggml-cpu_v4_clblast_noavx2.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#quants
|
#quants
|
||||||
|
@ -449,13 +449,27 @@ ggml-quants_noavx2.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-q
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
|
ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-quants.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
|
||||||
|
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-quants_noavx2.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
|
||||||
|
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-quants_failsafe.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
|
||||||
|
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
#aarch64
|
||||||
|
ggml-cpu-aarch64.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
|
||||||
|
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-aarch64_noavx2.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
|
||||||
|
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-aarch64_failsafe.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
|
||||||
|
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#sgemm
|
#sgemm
|
||||||
sgemm.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
|
sgemm.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
|
||||||
$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
sgemm_noavx2.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
|
sgemm_noavx2.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
|
||||||
$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
sgemm_failsafe.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
|
sgemm_failsafe.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
|
||||||
$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#there are no intrinsics or special gpu ops used here, so we can have a universal object
|
#there are no intrinsics or special gpu ops used here, so we can have a universal object
|
||||||
|
@ -469,6 +483,10 @@ unicode-data.o: src/unicode-data.cpp src/unicode-data.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
ggml-aarch64.o: ggml/src/ggml-aarch64.c ggml/include/ggml.h ggml/src/ggml-aarch64.h ggml/src/ggml-common.h
|
ggml-aarch64.o: ggml/src/ggml-aarch64.c ggml/include/ggml.h ggml/src/ggml-aarch64.h ggml/src/ggml-common.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
ggml-threading.o: ggml/src/ggml-threading.cpp ggml/include/ggml.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml-common.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#these have special gpu defines
|
#these have special gpu defines
|
||||||
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
||||||
|
@ -477,6 +495,12 @@ ggml-backend_vulkan.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h gg
|
||||||
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
ggml-backend_cublas.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
ggml-backend_cublas.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
||||||
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
|
ggml-backend-reg_default.o: ggml/src/ggml-backend-reg.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/include/ggml-cpu.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
ggml-backend-reg_vulkan.o: ggml/src/ggml-backend-reg.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/include/ggml-cpu.h
|
||||||
|
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
ggml-backend-reg_cublas.o: ggml/src/ggml-backend-reg.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/include/ggml-cpu.h
|
||||||
|
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
|
llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
|
llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
|
||||||
|
@ -485,7 +509,7 @@ llavaclip_vulkan.o: examples/llava/clip.cpp examples/llava/clip.h
|
||||||
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#this is only used for accelerate
|
#this is only used for accelerate
|
||||||
ggml-blas.o: ggml/src/ggml-blas.cpp ggml/include/ggml-blas.h
|
ggml-blas.o: ggml/src/ggml-blas/ggml-blas.cpp ggml/include/ggml-blas.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#version 3 libs
|
#version 3 libs
|
||||||
|
@ -533,7 +557,7 @@ ggml_v3-opencl.o: otherarch/ggml_v3-opencl.cpp otherarch/ggml_v3-opencl.h
|
||||||
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#vulkan
|
#vulkan
|
||||||
ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
||||||
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
# intermediate objects
|
# intermediate objects
|
||||||
|
@ -586,18 +610,18 @@ clean:
|
||||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||||
|
|
||||||
# useful tools
|
# useful tools
|
||||||
main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
gguf: examples/gguf/gguf.cpp build-info.h ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
|
@ -607,11 +631,11 @@ vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
||||||
|
|
||||||
#generated libraries
|
#generated libraries
|
||||||
koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(DEFAULT_BUILD)
|
$(DEFAULT_BUILD)
|
||||||
|
|
||||||
ifdef FAILSAFE_BUILD
|
ifdef FAILSAFE_BUILD
|
||||||
koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
|
koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FAILSAFE) $(OBJS)
|
||||||
$(FAILSAFE_BUILD)
|
$(FAILSAFE_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_failsafe:
|
koboldcpp_failsafe:
|
||||||
|
@ -619,7 +643,7 @@ koboldcpp_failsafe:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(NOAVX2_BUILD)
|
$(NOAVX2_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_noavx2:
|
koboldcpp_noavx2:
|
||||||
|
@ -627,10 +651,10 @@ koboldcpp_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CLBLAST_BUILD
|
ifdef CLBLAST_BUILD
|
||||||
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_clblast_noavx2:
|
koboldcpp_clblast_noavx2:
|
||||||
|
@ -644,7 +668,7 @@ koboldcpp_clblast_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CUBLAS_BUILD
|
ifdef CUBLAS_BUILD
|
||||||
koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(CUBLAS_BUILD)
|
$(CUBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_cublas:
|
koboldcpp_cublas:
|
||||||
|
@ -652,7 +676,7 @@ koboldcpp_cublas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef HIPBLAS_BUILD
|
ifdef HIPBLAS_BUILD
|
||||||
koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o ggml-backend-reg_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(HIPBLAS_BUILD)
|
$(HIPBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_hipblas:
|
koboldcpp_hipblas:
|
||||||
|
@ -660,10 +684,10 @@ koboldcpp_hipblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_FULL) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_vulkan_noavx2:
|
koboldcpp_vulkan_noavx2:
|
||||||
|
@ -677,17 +701,17 @@ koboldcpp_vulkan_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# tools
|
# tools
|
||||||
quantize_gguf: examples/quantize/quantize.cpp ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gguf: examples/quantize/quantize.cpp ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
#window simple clinfo
|
#window simple clinfo
|
||||||
|
|
|
@ -1969,18 +1969,13 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
||||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
|
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
||||||
|
|
|
@ -9,16 +9,16 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// buffer_type API
|
// buffer_type API
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_amx_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
|
||||||
|
|
||||||
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,20 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
|
||||||
|
#ifdef GGML_BACKEND_SHARED
|
||||||
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||||
|
# ifdef GGML_BACKEND_BUILD
|
||||||
|
# define GGML_BACKEND_API __declspec(dllexport) extern
|
||||||
|
# else
|
||||||
|
# define GGML_BACKEND_API __declspec(dllimport) extern
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define GGML_BACKEND_API extern
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -9,15 +9,15 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_blas_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||||
|
|
||||||
// number of threads used for conversion to float
|
// number of threads used for conversion to float
|
||||||
// for openblas and blis, this will also set the number of threads used for blas operations
|
// for openblas and blis, this will also set the number of threads used for blas operations
|
||||||
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -34,7 +34,7 @@ extern "C" {
|
||||||
*/
|
*/
|
||||||
#define GGML_CANN_MAX_DEVICES 16
|
#define GGML_CANN_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Initializes the CANN backend for a specified device.
|
* @brief Initializes the CANN backend for a specified device.
|
||||||
|
@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||||
* @param device The index of the device to initialize.
|
* @param device The index of the device to initialize.
|
||||||
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Checks if a given backend is a CANN backend.
|
* @brief Checks if a given backend is a CANN backend.
|
||||||
|
@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||||
* @param backend The backend instance to check.
|
* @param backend The backend instance to check.
|
||||||
* @return True if the backend is a CANN backend, false otherwise.
|
* @return True if the backend is a CANN backend, false otherwise.
|
||||||
*/
|
*/
|
||||||
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Retrieves the CANN buffer type for a specified device.
|
* @brief Retrieves the CANN buffer type for a specified device.
|
||||||
|
@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||||
* @return A pointer to the buffer type interface for the specified device, or
|
* @return A pointer to the buffer type interface for the specified device, or
|
||||||
* nullptr if the device index is out of range.
|
* nullptr if the device index is out of range.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_buffer_type_t
|
GGML_BACKEND_API ggml_backend_buffer_type_t
|
||||||
ggml_backend_cann_buffer_type(int32_t device);
|
ggml_backend_cann_buffer_type(int32_t device);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
|
||||||
*
|
*
|
||||||
* @return The number of CANN devices available.
|
* @return The number of CANN devices available.
|
||||||
*/
|
*/
|
||||||
GGML_API int32_t ggml_backend_cann_get_device_count(void);
|
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||||
*
|
*
|
||||||
* @return A pointer to the host buffer type interface.
|
* @return A pointer to the host buffer type interface.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Retrieves the description of a specific CANN device.
|
* @brief Retrieves the description of a specific CANN device.
|
||||||
|
@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||||
* @param description Pointer to a buffer where the description will be written.
|
* @param description Pointer to a buffer where the description will be written.
|
||||||
* @param description_size Size of the description buffer.
|
* @param description_size Size of the description buffer.
|
||||||
*/
|
*/
|
||||||
GGML_API void ggml_backend_cann_get_device_description(
|
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
|
||||||
int32_t device, char* description, size_t description_size);
|
int32_t device, char* description, size_t description_size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
|
||||||
* @param total Pointer to a variable where the total memory size will be
|
* @param total Pointer to a variable where the total memory size will be
|
||||||
* stored.
|
* stored.
|
||||||
*/
|
*/
|
||||||
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
|
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
|
||||||
size_t* free,
|
size_t* free,
|
||||||
size_t* total);
|
size_t* total);
|
||||||
|
|
||||||
|
|
|
@ -54,54 +54,77 @@ extern "C" {
|
||||||
GGML_NUMA_STRATEGY_COUNT
|
GGML_NUMA_STRATEGY_COUNT
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||||
|
|
||||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
|
||||||
const struct ggml_cgraph * cgraph,
|
const struct ggml_cgraph * cgraph,
|
||||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
|
||||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
|
|
||||||
// TODO: move to backend interface
|
//
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
// system info
|
||||||
GGML_API int ggml_cpu_has_sve (void);
|
//
|
||||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
|
||||||
// get the sve vector length in bytes
|
// x86
|
||||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
|
||||||
|
// ARM
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_neon (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_sve (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
||||||
|
// other
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
||||||
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
||||||
|
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
|
||||||
|
@ -115,6 +138,7 @@ extern "C" {
|
||||||
const void * GGML_RESTRICT y, int nr, int nc);
|
const void * GGML_RESTRICT y, int nr, int nc);
|
||||||
|
|
||||||
struct ggml_type_traits_cpu {
|
struct ggml_type_traits_cpu {
|
||||||
|
ggml_from_float_t from_float;
|
||||||
ggml_from_float_to_mat_t from_float_to_mat;
|
ggml_from_float_to_mat_t from_float_to_mat;
|
||||||
ggml_vec_dot_t vec_dot;
|
ggml_vec_dot_t vec_dot;
|
||||||
enum ggml_type vec_dot_type;
|
enum ggml_type vec_dot_type;
|
||||||
|
@ -124,25 +148,25 @@ extern "C" {
|
||||||
ggml_gemm_t gemm;
|
ggml_gemm_t gemm;
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||||
|
|
||||||
GGML_API void ggml_cpu_init(void);
|
GGML_BACKEND_API void ggml_cpu_init(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
// CPU backend
|
// CPU backend
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_HIPBLAS
|
#ifdef GGML_USE_HIP
|
||||||
#define GGML_CUDA_NAME "ROCm"
|
#define GGML_CUDA_NAME "ROCm"
|
||||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
|
@ -19,29 +19,29 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
#define GGML_CUDA_MAX_DEVICES 16
|
#define GGML_CUDA_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
GGML_BACKEND_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||||
|
|
||||||
// device buffer
|
// device buffer
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
|
||||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||||
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
|
||||||
// forward declaration
|
// forward declaration
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,27 +39,27 @@ extern "C" {
|
||||||
// user-code should use only these functions
|
// user-code should use only these functions
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_DEPRECATED(
|
GGML_DEPRECATED(
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||||
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
// ideally, the user code should be doing these checks
|
// ideally, the user code should be doing these checks
|
||||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||||
|
|
||||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,18 +10,18 @@ extern "C" {
|
||||||
#define GGML_RPC_MAX_SERVERS 16
|
#define GGML_RPC_MAX_SERVERS 16
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||||
|
|
||||||
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,32 +17,32 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||||
|
|
||||||
// devide buffer
|
// devide buffer
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||||
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
||||||
GGML_API void ggml_backend_sycl_get_device_description(int device,
|
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
|
||||||
char *description,
|
char *description,
|
||||||
size_t description_size);
|
size_t description_size);
|
||||||
GGML_API int ggml_backend_sycl_get_device_count();
|
GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
|
||||||
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||||
|
|
||||||
// SYCL doesn't support registering host memory, keep here for reference
|
// SYCL doesn't support registering host memory, keep here for reference
|
||||||
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||||
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,21 +10,21 @@ extern "C" {
|
||||||
#define GGML_VK_NAME "Vulkan"
|
#define GGML_VK_NAME "Vulkan"
|
||||||
#define GGML_VK_MAX_DEVICES 16
|
#define GGML_VK_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API void ggml_vk_instance_init(void);
|
GGML_BACKEND_API void ggml_vk_instance_init(void);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||||
GGML_API int ggml_backend_vk_get_device_count(void);
|
GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
|
||||||
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||||
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -176,15 +176,15 @@
|
||||||
#ifdef GGML_SHARED
|
#ifdef GGML_SHARED
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||||
# ifdef GGML_BUILD
|
# ifdef GGML_BUILD
|
||||||
# define GGML_API __declspec(dllexport)
|
# define GGML_API __declspec(dllexport) extern
|
||||||
# else
|
# else
|
||||||
# define GGML_API __declspec(dllimport)
|
# define GGML_API __declspec(dllimport) extern
|
||||||
# endif
|
# endif
|
||||||
# else
|
# else
|
||||||
# define GGML_API __attribute__ ((visibility ("default")))
|
# define GGML_API __attribute__ ((visibility ("default"))) extern
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
# define GGML_API
|
# define GGML_API extern
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO: support for clang
|
// TODO: support for clang
|
||||||
|
@ -1496,7 +1496,7 @@ extern "C" {
|
||||||
"use ggml_rope_ext_inplace instead");
|
"use ggml_rope_ext_inplace instead");
|
||||||
|
|
||||||
// compute correction dims for YaRN RoPE scaling
|
// compute correction dims for YaRN RoPE scaling
|
||||||
void ggml_rope_yarn_corr_dims(
|
GGML_API void ggml_rope_yarn_corr_dims(
|
||||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||||
|
|
||||||
// rotary position embedding backward, i.e compute dx from dy
|
// rotary position embedding backward, i.e compute dx from dy
|
||||||
|
@ -2390,38 +2390,6 @@ extern "C" {
|
||||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||||
|
|
||||||
//
|
|
||||||
// system info
|
|
||||||
//
|
|
||||||
|
|
||||||
GGML_API int ggml_cpu_has_avx (void);
|
|
||||||
GGML_API int ggml_cpu_has_avx_vnni (void);
|
|
||||||
GGML_API int ggml_cpu_has_avx2 (void);
|
|
||||||
GGML_API int ggml_cpu_has_avx512 (void);
|
|
||||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
|
||||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
|
||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
|
||||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
|
||||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
|
||||||
GGML_API int ggml_cpu_has_metal (void);
|
|
||||||
GGML_API int ggml_cpu_has_f16c (void);
|
|
||||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
|
||||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
|
||||||
GGML_API int ggml_cpu_has_blas (void);
|
|
||||||
GGML_API int ggml_cpu_has_cuda (void);
|
|
||||||
GGML_API int ggml_cpu_has_vulkan (void);
|
|
||||||
GGML_API int ggml_cpu_has_kompute (void);
|
|
||||||
GGML_API int ggml_cpu_has_gpublas (void);
|
|
||||||
GGML_API int ggml_cpu_has_sse3 (void);
|
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
|
||||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
|
||||||
GGML_API int ggml_cpu_has_sycl (void);
|
|
||||||
GGML_API int ggml_cpu_has_rpc (void);
|
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
|
||||||
GGML_API int ggml_cpu_has_cann (void);
|
|
||||||
GGML_API int ggml_cpu_has_llamafile (void);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
// restrict not standard in C++
|
// restrict not standard in C++
|
||||||
#define GGML_RESTRICT
|
#define GGML_RESTRICT
|
||||||
|
@ -2438,7 +2406,6 @@ extern "C" {
|
||||||
size_t type_size;
|
size_t type_size;
|
||||||
bool is_quantized;
|
bool is_quantized;
|
||||||
ggml_to_float_t to_float;
|
ggml_to_float_t to_float;
|
||||||
ggml_from_float_t from_float;
|
|
||||||
ggml_from_float_t from_float_ref;
|
ggml_from_float_t from_float_ref;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,9 +1,5 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#define GGML_COMMON_DECL_C
|
|
||||||
#include "ggml-common.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
// GGML internal header
|
// GGML internal header
|
||||||
|
@ -12,27 +8,11 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Quantization
|
|
||||||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
|
||||||
|
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
// GEMV
|
|
||||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
|
|
||||||
// GEMM
|
|
||||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-cpu-impl.h" // <immintrin.h>
|
// hack until AMX is moved into the CPU backend
|
||||||
|
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
|
@ -421,9 +421,18 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||||
|
|
||||||
#else // if defined(__AMX_INT8__)
|
#else // if defined(__AMX_INT8__)
|
||||||
|
|
||||||
|
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_is_amx(ggml_backend_t backend) {
|
||||||
|
GGML_UNUSED(backend);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_amx_init(void) {
|
ggml_backend_t ggml_backend_amx_init(void) {
|
||||||
fprintf(stderr, "GGML is not compiled with AMX support!\n");
|
fprintf(stderr, "GGML is not compiled with AMX support!\n");
|
||||||
return ggml_backend_t{};
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
|
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
|
||||||
|
@ -433,4 +442,8 @@ void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
|
||||||
GGML_UNUSED(n_threads);
|
GGML_UNUSED(n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
|
@ -496,19 +496,20 @@ inline void from_float(const float * x, char * vy, int64_t k);
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
|
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
|
||||||
quantize_row_q8_0(x, vy, k);
|
// FIXME: using unoptimized reference impl until moved to CPU backend
|
||||||
|
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
|
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
|
||||||
quantize_row_q8_1(x, vy, k);
|
quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
|
inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
|
||||||
#if 1
|
#if 1
|
||||||
// TODO: this is reference impl!
|
// TODO: this is reference impl!
|
||||||
quantize_row_q8_K(x, vy, k);
|
quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
|
||||||
#else
|
#else
|
||||||
quantize_row_q8_K_vnni(x, vy, k);
|
quantize_row_q8_K_vnni(x, vy, k);
|
||||||
#endif
|
#endif
|
||||||
|
|
195
ggml/src/ggml-backend-reg.cpp
Normal file
195
ggml/src/ggml-backend-reg.cpp
Normal file
|
@ -0,0 +1,195 @@
|
||||||
|
#include "ggml-backend-impl.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
#include "ggml-impl.h"
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Backend registry
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_SYCL
|
||||||
|
#include "ggml-sycl.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_BLAS
|
||||||
|
#include "ggml-blas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
|
#include "ggml-rpc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_AMX
|
||||||
|
# include "ggml-amx.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CANN
|
||||||
|
#include "ggml-cann.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_KOMPUTE
|
||||||
|
#include "ggml-kompute.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct ggml_backend_registry {
|
||||||
|
std::vector<ggml_backend_reg_t> backends;
|
||||||
|
std::vector<ggml_backend_dev_t> devices;
|
||||||
|
|
||||||
|
ggml_backend_registry() {
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
register_backend(ggml_backend_cuda_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
register_backend(ggml_backend_metal_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_SYCL
|
||||||
|
register_backend(ggml_backend_sycl_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
register_backend(ggml_backend_vk_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_CANN
|
||||||
|
register_backend(ggml_backend_cann_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_BLAS
|
||||||
|
register_backend(ggml_backend_blas_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
|
register_backend(ggml_backend_rpc_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_AMX
|
||||||
|
register_backend(ggml_backend_amx_reg());
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_KOMPUTE
|
||||||
|
register_backend(ggml_backend_kompute_reg());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
register_backend(ggml_backend_cpu_reg());
|
||||||
|
}
|
||||||
|
|
||||||
|
void register_backend(ggml_backend_reg_t reg) {
|
||||||
|
if (!reg) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||||
|
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||||
|
#endif
|
||||||
|
backends.push_back(reg);
|
||||||
|
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||||
|
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void register_device(ggml_backend_dev_t device) {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||||
|
#endif
|
||||||
|
devices.push_back(device);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_backend_registry & get_reg() {
|
||||||
|
static ggml_backend_registry reg;
|
||||||
|
return reg;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal API
|
||||||
|
void ggml_backend_register(ggml_backend_reg_t reg) {
|
||||||
|
get_reg().register_backend(reg);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
||||||
|
get_reg().register_device(device);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backend (reg) enumeration
|
||||||
|
size_t ggml_backend_reg_count() {
|
||||||
|
return get_reg().backends.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
||||||
|
GGML_ASSERT(index < ggml_backend_reg_count());
|
||||||
|
return get_reg().backends[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
||||||
|
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||||
|
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
||||||
|
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
||||||
|
return reg;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Device enumeration
|
||||||
|
size_t ggml_backend_dev_count() {
|
||||||
|
return get_reg().devices.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||||
|
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||||
|
return get_reg().devices[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||||
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
|
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
||||||
|
return dev;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||||
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
|
if (ggml_backend_dev_type(dev) == type) {
|
||||||
|
return dev;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convenience functions
|
||||||
|
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
||||||
|
if (!dev) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return ggml_backend_dev_init(dev, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
||||||
|
if (!dev) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return ggml_backend_dev_init(dev, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_t ggml_backend_init_best(void) {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||||
|
if (!dev) {
|
||||||
|
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
|
}
|
||||||
|
if (!dev) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return ggml_backend_dev_init(dev, NULL);
|
||||||
|
}
|
|
@ -525,197 +525,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
||||||
return reg->iface.get_proc_address(reg, name);
|
return reg->iface.get_proc_address(reg, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Backend registry
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
#include "ggml-metal.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
#include "ggml-sycl.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
|
||||||
#include "ggml-vulkan.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_BLAS
|
|
||||||
#include "ggml-blas.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_RPC
|
|
||||||
#include "ggml-rpc.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __AMX_INT8__
|
|
||||||
#undef GGML_USE_AMX
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_AMX
|
|
||||||
# include "ggml-amx.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
#include "ggml-cann.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
#include "ggml-kompute.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "ggml-cpu.h"
|
|
||||||
|
|
||||||
struct ggml_backend_registry {
|
|
||||||
std::vector<ggml_backend_reg_t> backends;
|
|
||||||
std::vector<ggml_backend_dev_t> devices;
|
|
||||||
|
|
||||||
ggml_backend_registry() {
|
|
||||||
#ifdef GGML_USE_CUDA
|
|
||||||
register_backend(ggml_backend_cuda_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
register_backend(ggml_backend_metal_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
register_backend(ggml_backend_sycl_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_VULKAN
|
|
||||||
register_backend(ggml_backend_vk_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
register_backend(ggml_backend_cann_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_BLAS
|
|
||||||
register_backend(ggml_backend_blas_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_RPC
|
|
||||||
register_backend(ggml_backend_rpc_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_AMX
|
|
||||||
register_backend(ggml_backend_amx_reg());
|
|
||||||
#endif
|
|
||||||
#ifdef GGML_USE_KOMPUTE
|
|
||||||
register_backend(ggml_backend_kompute_reg());
|
|
||||||
#endif
|
|
||||||
|
|
||||||
register_backend(ggml_backend_cpu_reg());
|
|
||||||
}
|
|
||||||
|
|
||||||
void register_backend(ggml_backend_reg_t reg) {
|
|
||||||
#ifndef NDEBUG
|
|
||||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
|
||||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
|
||||||
#endif
|
|
||||||
backends.push_back(reg);
|
|
||||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
|
||||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void register_device(ggml_backend_dev_t device) {
|
|
||||||
#ifndef NDEBUG
|
|
||||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
|
||||||
#endif
|
|
||||||
devices.push_back(device);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static ggml_backend_registry & get_reg() {
|
|
||||||
static ggml_backend_registry reg;
|
|
||||||
return reg;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Internal API
|
|
||||||
void ggml_backend_register(ggml_backend_reg_t reg) {
|
|
||||||
get_reg().register_backend(reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
|
||||||
get_reg().register_device(device);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Backend (reg) enumeration
|
|
||||||
size_t ggml_backend_reg_count() {
|
|
||||||
return get_reg().backends.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
|
||||||
GGML_ASSERT(index < ggml_backend_reg_count());
|
|
||||||
return get_reg().backends[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
|
||||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
||||||
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
|
||||||
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
|
||||||
return reg;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Device enumeration
|
|
||||||
size_t ggml_backend_dev_count() {
|
|
||||||
return get_reg().devices.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
|
||||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
|
||||||
return get_reg().devices[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
||||||
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
|
||||||
return dev;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
||||||
if (ggml_backend_dev_type(dev) == type) {
|
|
||||||
return dev;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convenience functions
|
|
||||||
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
|
||||||
if (!dev) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return ggml_backend_dev_init(dev, params);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
|
||||||
if (!dev) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return ggml_backend_dev_init(dev, params);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_init_best(void) {
|
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
|
||||||
if (!dev) {
|
|
||||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
||||||
}
|
|
||||||
if (!dev) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return ggml_backend_dev_init(dev, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
// multi-buffer buffer
|
// multi-buffer buffer
|
||||||
|
|
||||||
struct ggml_backend_multi_buffer_context {
|
struct ggml_backend_multi_buffer_context {
|
||||||
|
@ -1646,7 +1455,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||||
bool parallel) {
|
bool parallel) {
|
||||||
GGML_ASSERT(n_backends > 0);
|
GGML_ASSERT(n_backends > 0);
|
||||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||||
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
|
|
||||||
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
||||||
|
|
||||||
|
@ -2042,17 +1851,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
#include "ggml-backend-impl.h"
|
|
||||||
#include "ggml-cpu.h"
|
|
||||||
#include "ggml-impl.h"
|
|
||||||
#include <cctype>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
// ggml-backend interface
|
|
||||||
|
|
||||||
// CPU backend - buffer
|
// CPU backend - buffer
|
||||||
|
|
||||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
|
@ -2126,7 +1924,9 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||||
/* .reset = */ NULL,
|
/* .reset = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
// CPU backend - buffer type
|
// CPU backend buffer type
|
||||||
|
|
||||||
|
// this buffer type is defined here to make it available to all backends
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
@ -2167,7 +1967,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2190,478 +1990,14 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
return &ggml_backend_cpu_buffer_type;
|
return &ggml_backend_cpu_buffer_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
|
||||||
|
|
||||||
// buffer type HBM
|
|
||||||
|
|
||||||
#include <hbwmalloc.h>
|
|
||||||
|
|
||||||
// Name of the HBM-backed CPU buffer type; the argument is not consulted.
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "CPU_HBM";
}
|
|
||||||
|
|
||||||
// free_buffer callback for HBM buffers: hands the pointer stored in the
// buffer context to hbw_free().
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * hbm_mem = buffer->context;
    hbw_free(hbm_mem);
}
|
|
||||||
|
|
||||||
// Allocates `size` bytes with hbw_posix_memalign(), aligned as reported by
// ggml_backend_cpu_buffer_type_get_alignment(), and wraps the memory in a CPU
// buffer. Logs an error and returns NULL when the allocation fails.
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * hbm_mem;
    if (hbw_posix_memalign(&hbm_mem, ggml_backend_cpu_buffer_type_get_alignment(buft), size) != 0) {
        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
        return NULL;
    }

    // start from a plain from-pointer CPU buffer, then point it at this
    // buffer type and at the HBM-specific free callback
    ggml_backend_buffer_t hbm_buffer = ggml_backend_cpu_buffer_from_ptr(hbm_mem, size);
    hbm_buffer->buft              = buft;
    hbm_buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
    return hbm_buffer;
}
|
|
||||||
|
|
||||||
// Returns the singleton buffer type for HBM-backed CPU buffers.
//
// The initializer is positional. The previous version listed only `.iface`
// and `.context`, so the value labelled `.context` actually initialized the
// `device` member. The device is now set explicitly, in the same way as the
// other CPU buffer types in this file.
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context  = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type_hbm;
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Returns a NULL-terminated array of additional CPU buffer types. The HBM
// buffer type is listed only when GGML_USE_CPU_HBM is defined.
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
    GGML_UNUSED(device);

    static ggml_backend_buffer_type_t extra_bufts[] = {
#ifdef GGML_USE_CPU_HBM
        ggml_backend_cpu_hbm_buffer_type(),
#endif
        NULL
    };

    return extra_bufts;
}
|
|
||||||
|
|
||||||
// CPU backend - backend (stream)
|
|
||||||
|
|
||||||
struct ggml_backend_cpu_context {
|
|
||||||
int n_threads;
|
|
||||||
ggml_threadpool_t threadpool;
|
|
||||||
|
|
||||||
uint8_t * work_data;
|
|
||||||
size_t work_size;
|
|
||||||
|
|
||||||
ggml_abort_callback abort_callback;
|
|
||||||
void * abort_callback_data;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Name of the CPU backend; the backend argument is not consulted.
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return "CPU";
}
|
|
||||||
|
|
||||||
// Destroys a CPU backend: frees the scratch buffer, the context, and finally
// the backend object itself.
static void ggml_backend_cpu_free(ggml_backend_t backend) {
    ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context *) backend->context;

    delete[] ctx->work_data;
    delete ctx;

    delete backend;
}
|
|
||||||
|
|
||||||
struct ggml_backend_plan_cpu {
|
|
||||||
struct ggml_cplan cplan;
|
|
||||||
struct ggml_cgraph cgraph;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Builds a reusable plan for computing `cgraph` on the CPU backend.
//
// The plan stores a copy of the graph struct and a work buffer sized by
// ggml_graph_plan(). The backend's abort callback is copied into the plan.
// Returns NULL if the work buffer cannot be allocated.
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        // operator new[] reports failure by throwing, never by returning NULL,
        // so the failure path has to be an exception handler
        try {
            cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
        } catch (...) {
            delete cpu_plan;
            return NULL;
        }
    }

    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return cpu_plan;
}
|
|
||||||
|
|
||||||
// Releases a plan created by ggml_backend_cpu_graph_plan_create(): first its
// work buffer, then the plan object. The backend argument is unused.
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    ggml_backend_plan_cpu * owned = (ggml_backend_plan_cpu *) plan;

    delete[] owned->cplan.work_data;
    delete owned;
}
|
|
||||||
|
|
||||||
// Runs the graph stored in `plan` with the compute plan stored next to it and
// returns the status from ggml_graph_compute(). The backend argument is unused.
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    ggml_backend_plan_cpu * stored = (ggml_backend_plan_cpu *) plan;

    return ggml_graph_compute(&stored->cgraph, &stored->cplan);
}
|
|
||||||
|
|
||||||
// Plans and computes `cgraph` in one step, reusing the backend's scratch
// buffer and growing it when the plan needs more space.
//
// Returns GGML_STATUS_ALLOC_FAILED if the scratch buffer cannot be grown;
// otherwise returns the status from ggml_graph_compute().
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

    if (cpu_ctx->work_size < cplan.work_size) {
        delete[] cpu_ctx->work_data;
        // clear the bookkeeping before reallocating so that a failed allocation
        // cannot leave a dangling pointer paired with a stale size
        cpu_ctx->work_data = NULL;
        cpu_ctx->work_size = 0;

        // operator new[] reports failure by throwing, never by returning NULL
        try {
            cpu_ctx->work_data = new uint8_t[cplan.work_size];
        } catch (...) {
            return GGML_STATUS_ALLOC_FAILED;
        }
        cpu_ctx->work_size = cplan.work_size;
    }
    cplan.work_data = (uint8_t *)cpu_ctx->work_data;

    cplan.abort_callback      = cpu_ctx->abort_callback;
    cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return ggml_graph_compute(cgraph, &cplan);
}
|
|
||||||
|
|
||||||
// Backend interface table of the CPU backend. The initializer is positional,
// so the entry order must not change. NULL entries are callbacks this backend
// does not provide.
static const struct ggml_backend_i ggml_backend_cpu_i = {
    /* .get_name           = */ ggml_backend_cpu_get_name,
    /* .free               = */ ggml_backend_cpu_free,
    /* .set_tensor_async   = */ NULL,
    /* .get_tensor_async   = */ NULL,
    /* .cpy_tensor_async   = */ NULL,
    /* .synchronize        = */ NULL,
    /* .graph_plan_create  = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free    = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_update  = */ NULL,
    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute      = */ ggml_backend_cpu_graph_compute,
    /* .event_record       = */ NULL,
    /* .event_wait         = */ NULL,
};
|
|
||||||
|
|
||||||
// Returns a pointer to the fixed GUID that identifies the CPU backend.
static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid cpu_guid = {
        0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a,
        0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89,
    };

    return &cpu_guid;
}
|
|
||||||
|
|
||||||
// Creates a CPU backend with default settings: GGML_DEFAULT_N_THREADS
// threads, no threadpool, no scratch buffer and no abort callback.
//
// Returns NULL if either the context or the backend object cannot be
// allocated; a context that was already allocated is released in that case.
ggml_backend_t ggml_backend_cpu_init(void) {
    // initialize CPU backend now to avoid slowing the first graph computation
    ggml_cpu_init();

    struct ggml_backend_cpu_context * ctx = NULL;
    ggml_backend_t cpu_backend = NULL;

    // operator new reports failure by throwing, never by returning NULL, so
    // failures are handled with an exception handler
    try {
        ctx = new ggml_backend_cpu_context;

        ctx->n_threads           = GGML_DEFAULT_N_THREADS;
        ctx->threadpool          = NULL;
        ctx->work_data           = NULL;
        ctx->work_size           = 0;
        ctx->abort_callback      = NULL;
        ctx->abort_callback_data = NULL;

        cpu_backend = new ggml_backend {
            /* .guid      = */ ggml_backend_cpu_guid(),
            /* .interface = */ ggml_backend_cpu_i,
            /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
            /* .context   = */ ctx,
        };
    } catch (...) {
        delete ctx; // no-op when the context allocation itself failed
        return NULL;
    }

    return cpu_backend;
}
|
|
||||||
|
|
||||||
// True when `backend` is non-NULL and its GUID matches the CPU backend GUID.
bool ggml_backend_is_cpu(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }

    return ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
|
|
||||||
|
|
||||||
// Stores the thread count in the context of a CPU backend.
// Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_state = (ggml_backend_cpu_context *) backend_cpu->context;

    cpu_state->n_threads = n_threads;
}
|
|
||||||
|
|
||||||
// Replaces the threadpool of a CPU backend. A previously set, different
// threadpool is paused before the switch.
// Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_state = (ggml_backend_cpu_context *) backend_cpu->context;

    ggml_threadpool_t previous = cpu_state->threadpool;
    if (previous && previous != threadpool) {
        // already had a different threadpool, pause/suspend it before switching
        ggml_threadpool_pause(previous);
    }

    cpu_state->threadpool = threadpool;
}
|
|
||||||
|
|
||||||
// Stores the abort callback and its user data in the context of a CPU backend.
// Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_state = (ggml_backend_cpu_context *) backend_cpu->context;

    cpu_state->abort_callback_data = abort_callback_data;
    cpu_state->abort_callback      = abort_callback;
}
|
|
||||||
|
|
||||||
// Wraps caller-owned memory in a CPU buffer of the from-pointer buffer type.
// Asserts that `ptr` is a multiple of TENSOR_ALIGNMENT.
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");

    ggml_backend_buffer_t wrapped = ggml_backend_buffer_init(
        ggml_backend_cpu_buffer_from_ptr_type(),
        ggml_backend_cpu_buffer_from_ptr_i,
        ptr,
        size);
    return wrapped;
}
|
||||||
|
|
||||||
// CPU backend - device
|
|
||||||
|
|
||||||
// Context of the CPU device. It carries only the description string returned
// by ggml_backend_cpu_device_get_description().
//
// The description defaults to "CPU". The constructor tries to replace it with
// the processor brand string from a platform-specific source:
//   - Apple:   sysctl "machdep.cpu.brand_string"
//   - Linux:   the first "model name" entry of /proc/cpuinfo
//   - Windows: registry value "ProcessorNameString"
// If a lookup fails or yields an empty string, the default is kept.
struct ggml_backend_cpu_device_context {
    std::string description = "CPU";

    ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0) && len > 0) {
            // read into a temporary so a failed second call cannot clobber the default
            std::string brand(len, '\0');
            if (!sysctlbyname("machdep.cpu.brand_string", &brand[0], &len, NULL, 0)) { // NOLINT
                // cut at the first NUL so the terminator is not kept as string data
                const size_t nul = brand.find('\0');
                if (nul != std::string::npos) {
                    brand.resize(nul);
                }
                if (!brand.empty()) {
                    description = brand;
                }
            }
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                if (strncmp(buf, "model name", 10) != 0) {
                    continue;
                }
                char * p = strchr(buf, ':');
                if (!p) {
                    continue;
                }
                p++;
                // std::isspace requires a value representable as unsigned char
                while (std::isspace((unsigned char) *p)) {
                    p++;
                }
                // trim trailing whitespace without ever indexing before the
                // start of an empty string
                size_t n = strlen(p);
                while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
                    n--;
                }
                if (n > 0) {
                    description.assign(p, n);
                }
                break;
            }
            fclose(f);
        }
#elif defined(_WIN32)
        // use the ANSI registry functions with narrow literals throughout;
        // TEXT() literals become wide strings when UNICODE is defined and
        // must not be passed to the *A functions
        HKEY hKey;
        if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
                          "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                          0,
                          KEY_READ,
                          &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                 "ProcessorNameString",
                                 NULL,
                                 NULL,
                                 NULL,
                                 &cpu_brand_size) == ERROR_SUCCESS && cpu_brand_size > 0) {
                // read into a temporary so a failed second query cannot clobber the default
                std::string brand(cpu_brand_size, '\0');
                if (RegQueryValueExA(hKey,
                                     "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     (LPBYTE)&brand[0], // NOLINT
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    const size_t nul = brand.find('\0');
                    if (nul != std::string::npos) {
                        brand.resize(nul);
                    }
                    if (!brand.empty()) {
                        description = brand;
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};
|
|
||||||
|
|
||||||
// Name of the CPU device; the device argument is not consulted.
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return "CPU";
}
|
|
||||||
|
|
||||||
// Returns the description string held by the device context. The pointer
// refers to storage owned by that context.
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
    const ggml_backend_cpu_device_context * dev_ctx = (ggml_backend_cpu_device_context *) dev->context;

    return dev_ctx->description.c_str();
}
|
|
||||||
|
|
||||||
// Memory query of the CPU device. Not implemented yet: both outputs are set
// to zero.
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);

    // TODO
    *total = 0;
    *free  = 0;
}
|
|
||||||
|
|
||||||
// Device type of the CPU device; always GGML_BACKEND_DEVICE_TYPE_CPU.
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return GGML_BACKEND_DEVICE_TYPE_CPU;
}
|
|
||||||
|
|
||||||
// Fills `props` from the individual device getters and sets the capability
// flags: only buffer_from_host_ptr is reported as supported.
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);

    props->type        = ggml_backend_cpu_device_get_type(dev);
    props->description = ggml_backend_cpu_device_get_description(dev);
    props->name        = ggml_backend_cpu_device_get_name(dev);

    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ true,
        /* .events                = */ false,
    };
}
|
|
||||||
|
|
||||||
// Creates a CPU backend via ggml_backend_cpu_init(). Both arguments are
// ignored.
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(dev);
    GGML_UNUSED(params);

    return ggml_backend_cpu_init();
}
|
|
||||||
|
|
||||||
// Buffer type of the CPU device: the regular CPU buffer type.
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return ggml_backend_cpu_buffer_type();
}
|
|
||||||
|
|
||||||
// Wraps a host pointer in a CPU buffer via ggml_backend_cpu_buffer_from_ptr().
// `dev` and `max_tensor_size` are ignored.
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    GGML_UNUSED(dev);
    GGML_UNUSED(max_tensor_size);

    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
}
|
|
||||||
|
|
||||||
// Reports whether the CPU device supports the operation described by `op`.
// Only the operations listed in the switch have restrictions; every other
// operation is reported as supported.
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    GGML_UNUSED(dev);

    switch (op->op) {
        case GGML_OP_CPY: {
            // missing type_traits.from_float
            const bool lacks_from_float =
                op->type == GGML_TYPE_IQ2_XXS ||
                op->type == GGML_TYPE_IQ2_XS  ||
                op->type == GGML_TYPE_IQ1_S   ||
                op->type == GGML_TYPE_IQ1_M;
            return !lacks_from_float;
        }
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
        case GGML_OP_ROPE_BACK: {
            if (op->src[2] != NULL) {
                return false;
            }
            return (op->op_params[2] & 4) == 0;
        }
        case GGML_OP_IM2COL_BACK: {
            if (op->src[0]->type != GGML_TYPE_F32) {
                return false;
            }
            return op->src[1]->type == GGML_TYPE_F32;
        }
        case GGML_OP_OUT_PROD: {
            const bool src0_ok = op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type);
            return src0_ok && op->src[1]->type == GGML_TYPE_F32;
        }
        default:
            return true;
    }
}
|
|
||||||
|
|
||||||
// The CPU device accepts exactly those buffer types that
// ggml_backend_buft_is_host() reports as host buffer types.
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(dev);

    return ggml_backend_buft_is_host(buft);
}
|
|
||||||
|
|
||||||
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
|
||||||
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
|
||||||
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
|
||||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
|
||||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
|
||||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
|
||||||
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
|
||||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
|
||||||
/* .get_host_buffer_type = */ NULL,
|
|
||||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
|
||||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
|
||||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
|
||||||
/* .offload_op = */ NULL,
|
|
||||||
/* .event_new = */ NULL,
|
|
||||||
/* .event_free = */ NULL,
|
|
||||||
/* .event_synchronize = */ NULL,
|
|
||||||
};
|
|
||||||
|
|
||||||
// CPU backend - backend (reg)
|
|
||||||
|
|
||||||
// Name of the CPU backend registry; the registry argument is not consulted.
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    return "CPU";
}
|
|
||||||
|
|
||||||
// The CPU registry exposes exactly one device.
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    return 1;
}
|
|
||||||
|
|
||||||
// Returns the single CPU device; asserts that `index` is 0.
// The device and its context are function-local statics, so they are built
// on the first call and the device keeps the `reg` passed in that call.
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_cpu_device_context device_ctx;
    static ggml_backend_device cpu_device = {
        /* .iface   = */ ggml_backend_cpu_device_i,
        /* .reg     = */ reg,
        /* .context = */ &device_ctx,
    };

    return &cpu_device;
}
|
|
||||||
|
|
||||||
// Looks up an optional entry point of the CPU backend by name.
// Two names are recognized; any other name yields NULL.
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);

    const bool wants_set_n_threads = strcmp(name, "ggml_backend_set_n_threads") == 0;
    if (wants_set_n_threads) {
        return (void *)ggml_backend_cpu_set_n_threads;
    }

    const bool wants_extra_bufts = strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0;
    if (wants_extra_bufts) {
        return (void *)ggml_backend_cpu_get_extra_bufts;
    }

    return NULL;
}
|
|
||||||
|
|
||||||
// Registry interface table of the CPU backend. The initializer is
// positional, so the entry order must not change.
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
|
|
||||||
|
|
||||||
// Returns the singleton registry object of the CPU backend. It has no
// context of its own.
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
    static struct ggml_backend_reg cpu_reg = {
        /* .iface   = */ ggml_backend_cpu_reg_i,
        /* .context = */ NULL,
    };

    return &cpu_reg;
}
|
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE)
|
#if defined(GGML_BLAS_USE_ACCELERATE)
|
||||||
# include <Accelerate/Accelerate.h>
|
# include <Accelerate/Accelerate.h>
|
||||||
#elif defined(GGML_BLAS_USE_MKL)
|
#elif defined(GGML_BLAS_USE_MKL)
|
||||||
# include <mkl.h>
|
# include <mkl.h>
|
||||||
|
@ -320,7 +320,7 @@ static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
|
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
|
||||||
#if defined(GGML_USE_ACCELERATE)
|
#if defined(GGML_BLAS_USE_ACCELERATE)
|
||||||
return "Accelerate";
|
return "Accelerate";
|
||||||
#elif defined(GGML_BLAS_USE_MKL)
|
#elif defined(GGML_BLAS_USE_MKL)
|
||||||
return "MKL";
|
return "MKL";
|
3387
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
Normal file
3387
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
Normal file
File diff suppressed because it is too large
Load diff
27
ggml/src/ggml-cpu/ggml-cpu-aarch64.h
Normal file
27
ggml/src/ggml-cpu/ggml-cpu-aarch64.h
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
#pragma once

#include "ggml.h"

// GGML internal header
//
// Declarations of the kernels for interleaved Q4_0 weight layouts
// (4x4, 4x8 and 8x8) that operate against Q8_0 data.
// NOTE(review): the parameter meanings (n, bs, nr, nc, blck_size_interleave)
// are not documented in this header - see the definitions in ggml-cpu-aarch64.c.

#ifdef __cplusplus
extern "C" {
#endif

// Quantization
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);

// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#ifdef __cplusplus
}
#endif
|
||||||
|
|
|
@ -27,80 +27,6 @@ extern "C" {
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts brain16 to float32.
|
|
||||||
*
|
|
||||||
* The bfloat16 floating point format has the following structure:
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌──┴───┐┌─┴───┐
|
|
||||||
* 0b0000000000000000 brain16
|
|
||||||
*
|
|
||||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
|
||||||
* encoding and decoding numbers becomes relatively straightforward.
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌──┴───┐┌─┴───────────────────┐
|
|
||||||
* 0b00000000000000000000000000000000 IEEE binary32
|
|
||||||
*
|
|
||||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌─┴─┐┌─┴──────┐
|
|
||||||
* 0b0000000000000000 IEEE binary16
|
|
||||||
*
|
|
||||||
* @see IEEE 754-2008
|
|
||||||
*/
|
|
||||||
// Widens a bf16 value to fp32: its 16 bits become the upper half of the
// 32-bit pattern and the lower half is zero.
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        uint32_t bits;
        float    value;
    } widen;

    widen.bits = ((uint32_t) h.bits) << 16;

    return widen.value;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts float32 to brain16.
|
|
||||||
*
|
|
||||||
* This is binary identical with Google Brain float conversion.
|
|
||||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
|
||||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
|
||||||
* This code should vectorize nicely if using modern compilers.
|
|
||||||
*/
|
|
||||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|
||||||
ggml_bf16_t h;
|
|
||||||
union {
|
|
||||||
float f;
|
|
||||||
uint32_t i;
|
|
||||||
} u;
|
|
||||||
u.f = s;
|
|
||||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
|
||||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
|
||||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
|
||||||
|
|
||||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||||
#ifndef __FMA__
|
#ifndef __FMA__
|
||||||
|
@ -388,28 +314,6 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||||
|
|
||||||
#endif // defined(__ARM_NEON)
|
#endif // defined(__ARM_NEON)
|
||||||
|
|
||||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
|
|
||||||
// NEON build: reinterprets the stored half as the compiler's native fp16
// type and lets the compiler convert it to float.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t native_half;

    memcpy(&native_half, &h, sizeof(ggml_fp16_t));

    return (float) native_half;
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
ggml_fp16_t res;
|
|
||||||
ggml_fp16_internal_t tmp = f;
|
|
||||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
#ifdef __wasm_simd128__
|
||||||
#include <wasm_simd128.h>
|
#include <wasm_simd128.h>
|
||||||
#else
|
#else
|
||||||
|
@ -462,153 +366,6 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __F16C__
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
||||||
#else
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
/* the inline asm below is about 12% faster than the lookup method */
|
|
||||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
|
|
||||||
// POWER9 build: converts one fp16 value to fp32 with inline assembly.
// Operand numbering follows the constraint list below: %0 is the double
// scratch `d`, %1 is the float result `f`, %2 is the input `h`.
// NOTE(review): the exact semantics of mtfprd / xscvhpdp / frsp are not
// visible in this file - confirm against the Power ISA before editing.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
register float f;
|
|
||||||
register double d;
|
|
||||||
__asm__(
|
|
||||||
"mtfprd %0,%2\n"
|
|
||||||
"xscvhpdp %0,%0\n"
|
|
||||||
"frsp %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=f"(f):
|
|
||||||
/* in */ "r"(h));
|
|
||||||
return f;
|
|
||||||
}
|
|
||||||
|
|
||||||
// POWER9 build: converts one fp32 value to fp16 with inline assembly.
// Operand numbering follows the constraint list below: %0 is the double
// scratch `d`, %1 is the result `r`, %2 is the input `f`.
// NOTE(review): the exact semantics of xscvdphp / mffprd are not visible in
// this file - confirm against the Power ISA before editing.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
register double d;
|
|
||||||
register ggml_fp16_t r;
|
|
||||||
__asm__( /* xscvdphp can work on double or single precision */
|
|
||||||
"xscvdphp %0,%2\n"
|
|
||||||
"mffprd %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=r"(r):
|
|
||||||
/* in */ "f"(f));
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// FP16 <-> FP32
|
|
||||||
// ref: https://github.com/Maratyszcza/FP16
|
|
||||||
|
|
||||||
// Reinterprets a 32-bit pattern as a float without changing any bits.
static inline float fp32_from_bits(uint32_t w) {
    float value;

    memcpy(&value, &w, sizeof(value));

    return value;
}
|
|
||||||
|
|
||||||
// Returns the 32-bit pattern that represents the float `f`.
static inline uint32_t fp32_to_bits(float f) {
    uint32_t bits;

    memcpy(&bits, &f, sizeof(bits));

    return bits;
}
|
|
||||||
|
|
||||||
// Portable fp16 -> fp32 conversion (ref: https://github.com/Maratyszcza/FP16).
// Two candidate magnitudes are computed, one for the normalized path and one
// for the denormalized path. The doubled input word selects between them,
// and the sign bit is then reattached.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t word     = (uint32_t) h << 16;
    const uint32_t sign_bit = word & UINT32_C(0x80000000);
    const uint32_t doubled  = word + word; // shifts the sign bit out

    // normalized path
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized = fp32_from_bits((doubled >> 4) + exp_offset) * exp_scale;

    // denormalized path
    const uint32_t magic_mask   = UINT32_C(126) << 23;
    const float    magic_bias   = 0.5f;
    const float    denormalized = fp32_from_bits((doubled >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;

    uint32_t magnitude;
    if (doubled < denormalized_cutoff) {
        magnitude = fp32_to_bits(denormalized);
    } else {
        magnitude = fp32_to_bits(normalized);
    }

    return fp32_from_bits(sign_bit | magnitude);
}
|
|
||||||
|
|
||||||
// Portable fp32 -> fp16 conversion (ref: https://github.com/Maratyszcza/FP16).
// Inputs whose doubled bit pattern exceeds 0xFF000000 produce 0x7E00 below
// the sign bit; all other inputs go through the scale-and-bias sequence.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf  = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf  = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float scaled = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t word     = fp32_to_bits(f);
    const uint32_t doubled  = word + word; // shifts the sign bit out
    const uint32_t sign_bit = word & UINT32_C(0x80000000);

    // exponent field of the doubled word, clamped from below
    uint32_t bias = doubled & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    scaled = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + scaled;

    const uint32_t scaled_bits   = fp32_to_bits(scaled);
    const uint32_t exp_bits      = (scaled_bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = scaled_bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign       = exp_bits + mantissa_bits;

    if (doubled > UINT32_C(0xFF000000)) {
        return (sign_bit >> 16) | UINT16_C(0x7E00);
    }
    return (sign_bit >> 16) | nonsign;
}
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#endif // __F16C__
|
|
||||||
|
|
||||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
|
||||||
|
|
||||||
#ifdef __ARM_FEATURE_SVE
|
|
||||||
#include <arm_sve.h>
|
|
||||||
#endif // __ARM_FEATURE_SVE
|
|
||||||
|
|
||||||
// precomputed f32 table for f16 (256 KB)
|
|
||||||
// defined in ggml.c, initialized in ggml_init()
|
|
||||||
extern float ggml_table_f32_f16[1 << 16];
|
|
||||||
|
|
||||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
||||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
||||||
// This is also true for POWER9.
|
|
||||||
#if !defined(GGML_FP16_TO_FP32)
|
|
||||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
||||||
uint16_t s;
|
|
||||||
memcpy(&s, &f, sizeof(uint16_t));
|
|
||||||
return ggml_table_f32_f16[s];
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(GGML_FP32_TO_FP16)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
10797
ggml/src/ggml-cpu/ggml-cpu-quants.c
Normal file
10797
ggml/src/ggml-cpu/ggml-cpu-quants.c
Normal file
File diff suppressed because it is too large
Load diff
63
ggml/src/ggml-cpu/ggml-cpu-quants.h
Normal file
63
ggml/src/ggml-cpu/ggml-cpu-quants.h
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#define GGML_COMMON_DECL_C
|
||||||
|
#include "ggml-common.h"
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
// GGML CPU internal header
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Quantization
|
||||||
|
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
// Dot product
|
||||||
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -1,13 +1,15 @@
|
||||||
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
|
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
|
||||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||||
|
|
||||||
#include "ggml-aarch64.h"
|
|
||||||
#include "ggml-backend-impl.h"
|
#include "ggml-backend-impl.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-cpu-aarch64.h"
|
||||||
#include "ggml-cpu-impl.h"
|
#include "ggml-cpu-impl.h"
|
||||||
#include "ggml-cpu.h"
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
|
#include "ggml-cpu-quants.h"
|
||||||
|
#include "ggml-threading.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
|
@ -42,7 +44,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_LLAMAFILE
|
#ifdef GGML_USE_LLAMAFILE
|
||||||
#include <llamafile/sgemm.h>
|
#include "llamafile/sgemm.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
|
@ -107,9 +109,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||||
// precomputed quick gelu table for f16 (128 KB)
|
// precomputed quick gelu table for f16 (128 KB)
|
||||||
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
||||||
|
|
||||||
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
|
||||||
float ggml_table_f32_f16[1 << 16];
|
|
||||||
|
|
||||||
#if defined(__ARM_ARCH)
|
#if defined(__ARM_ARCH)
|
||||||
struct ggml_arm_arch_features_type {
|
struct ggml_arm_arch_features_type {
|
||||||
int has_neon;
|
int has_neon;
|
||||||
|
@ -264,11 +263,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_F16] = {
|
[GGML_TYPE_F16] = {
|
||||||
|
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0] = {
|
[GGML_TYPE_Q4_0] = {
|
||||||
|
.from_float = quantize_row_q4_0,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
@ -278,6 +279,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
#endif
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_1] = {
|
[GGML_TYPE_Q4_1] = {
|
||||||
|
.from_float = quantize_row_q4_1,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
@ -286,27 +288,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
#endif
|
#endif
|
||||||
},
|
},
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
|
||||||
.vec_dot = NULL,
|
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
|
||||||
.nrows = 1,
|
|
||||||
},
|
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
|
||||||
.vec_dot = NULL,
|
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
|
||||||
.nrows = 1,
|
|
||||||
},
|
|
||||||
[GGML_TYPE_Q5_0] = {
|
[GGML_TYPE_Q5_0] = {
|
||||||
|
.from_float = quantize_row_q5_0,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_1] = {
|
[GGML_TYPE_Q5_1] = {
|
||||||
|
.from_float = quantize_row_q5_1,
|
||||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_0] = {
|
[GGML_TYPE_Q8_0] = {
|
||||||
|
.from_float = quantize_row_q8_0,
|
||||||
.from_float_to_mat = quantize_mat_q8_0,
|
.from_float_to_mat = quantize_mat_q8_0,
|
||||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
@ -317,85 +312,106 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
#endif
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_1] = {
|
[GGML_TYPE_Q8_1] = {
|
||||||
|
.from_float = quantize_row_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
|
.from_float = quantize_row_q2_K,
|
||||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
|
.from_float = quantize_row_q3_K,
|
||||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_K] = {
|
[GGML_TYPE_Q4_K] = {
|
||||||
|
.from_float = quantize_row_q4_K,
|
||||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_K] = {
|
[GGML_TYPE_Q5_K] = {
|
||||||
|
.from_float = quantize_row_q5_K,
|
||||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
|
.from_float = quantize_row_q6_K,
|
||||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XXS] = {
|
[GGML_TYPE_IQ2_XXS] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XS] = {
|
[GGML_TYPE_IQ2_XS] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ3_XXS] = {
|
[GGML_TYPE_IQ3_XXS] = {
|
||||||
|
// NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
|
||||||
|
//.from_float = quantize_row_iq3_xxs,
|
||||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ3_S] = {
|
[GGML_TYPE_IQ3_S] = {
|
||||||
|
//.from_float = quantize_row_iq3_s,
|
||||||
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_S] = {
|
[GGML_TYPE_IQ2_S] = {
|
||||||
|
//.from_float = quantize_row_iq2_s,
|
||||||
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ1_S] = {
|
[GGML_TYPE_IQ1_S] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ1_M] = {
|
[GGML_TYPE_IQ1_M] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ4_NL] = {
|
[GGML_TYPE_IQ4_NL] = {
|
||||||
|
.from_float = quantize_row_iq4_nl,
|
||||||
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ4_XS] = {
|
[GGML_TYPE_IQ4_XS] = {
|
||||||
|
.from_float = quantize_row_iq4_xs,
|
||||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q8_K] = {
|
||||||
|
.from_float = quantize_row_q8_K,
|
||||||
|
},
|
||||||
[GGML_TYPE_BF16] = {
|
[GGML_TYPE_BF16] = {
|
||||||
|
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||||
.vec_dot_type = GGML_TYPE_BF16,
|
.vec_dot_type = GGML_TYPE_BF16,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0_4_4] = {
|
[GGML_TYPE_Q4_0_4_4] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
|
@ -404,6 +420,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0_4_8] = {
|
[GGML_TYPE_Q4_0_4_8] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
|
@ -412,6 +429,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0_8_8] = {
|
[GGML_TYPE_Q4_0_8_8] = {
|
||||||
|
.from_float = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
|
@ -420,11 +438,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_TQ1_0] = {
|
[GGML_TYPE_TQ1_0] = {
|
||||||
|
.from_float = quantize_row_tq1_0,
|
||||||
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
|
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_TQ2_0] = {
|
[GGML_TYPE_TQ2_0] = {
|
||||||
|
.from_float = quantize_row_tq2_0,
|
||||||
.vec_dot = ggml_vec_dot_tq2_0_q8_K,
|
.vec_dot = ggml_vec_dot_tq2_0_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
|
@ -2253,22 +2273,7 @@ struct ggml_state {
|
||||||
struct ggml_numa_nodes numa;
|
struct ggml_numa_nodes numa;
|
||||||
};
|
};
|
||||||
|
|
||||||
// global state
|
|
||||||
static struct ggml_state g_state = {0};
|
static struct ggml_state g_state = {0};
|
||||||
static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
|
|
||||||
|
|
||||||
// TODO: move to threading file
|
|
||||||
// critical section via spin lock
|
|
||||||
void ggml_critical_section_start(void) {
|
|
||||||
while (atomic_flag_test_and_set(&g_state_critical)) {
|
|
||||||
// spin
|
|
||||||
sched_yield();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_critical_section_end(void) {
|
|
||||||
atomic_flag_clear(&g_state_critical);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_barrier(struct ggml_threadpool * tp) {
|
static void ggml_barrier(struct ggml_threadpool * tp) {
|
||||||
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
|
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
|
||||||
|
@ -3001,8 +3006,8 @@ static void ggml_compute_forward_dup_f16(
|
||||||
id += ne00 * (ne01 - ir1);
|
id += ne00 * (ne01 - ir1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||||
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||||
|
|
||||||
size_t id = 0;
|
size_t id = 0;
|
||||||
|
@ -3282,8 +3287,8 @@ static void ggml_compute_forward_dup_bf16(
|
||||||
id += ne00 * (ne01 - ir1);
|
id += ne00 * (ne01 - ir1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||||
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||||
|
|
||||||
size_t id = 0;
|
size_t id = 0;
|
||||||
|
@ -3598,8 +3603,8 @@ static void ggml_compute_forward_dup_f32(
|
||||||
id += rs * (ne01 - ir1);
|
id += rs * (ne01 - ir1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||||
|
|
||||||
size_t id = 0;
|
size_t id = 0;
|
||||||
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
|
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
|
||||||
|
@ -4393,7 +4398,7 @@ static void ggml_compute_forward_add_q_f32(
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
const enum ggml_type dtype = dst->type;
|
const enum ggml_type dtype = dst->type;
|
||||||
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
||||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float;
|
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float;
|
||||||
|
|
||||||
// we don't support permuted src0 or src1
|
// we don't support permuted src0 or src1
|
||||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||||
|
@ -4695,7 +4700,7 @@ static void ggml_compute_forward_add1_q_f32(
|
||||||
|
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
||||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float;
|
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float;
|
||||||
|
|
||||||
// we don't support permuted src0
|
// we don't support permuted src0
|
||||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||||
|
@ -7456,7 +7461,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
|
|
||||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float;
|
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||||
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
|
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
|
||||||
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
|
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
|
||||||
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
||||||
|
@ -7685,7 +7690,7 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
|
|
||||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float;
|
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||||
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
||||||
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
|
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
|
||||||
|
|
||||||
|
@ -9195,12 +9200,6 @@ static void rope_yarn(
|
||||||
*sin_theta = sinf(theta) * mscale;
|
*sin_theta = sinf(theta) * mscale;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
|
||||||
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
|
||||||
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
|
||||||
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_rope_cache_init(
|
static void ggml_rope_cache_init(
|
||||||
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
||||||
float * cache, float sin_sign, float theta_scale) {
|
float * cache, float sin_sign, float theta_scale) {
|
||||||
|
@ -9217,16 +9216,6 @@ static void ggml_rope_cache_init(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_rope_yarn_corr_dims(
|
|
||||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
||||||
) {
|
|
||||||
// start and end correction dims
|
|
||||||
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
|
||||||
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
|
||||||
dims[0] = MAX(0, start);
|
|
||||||
dims[1] = MIN(n_dims - 1, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_rope_f32(
|
static void ggml_compute_forward_rope_f32(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
struct ggml_tensor * dst,
|
struct ggml_tensor * dst,
|
||||||
|
@ -10704,7 +10693,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||||
|
|
||||||
enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
|
enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
|
||||||
ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits(k_vec_dot_type)->from_float;
|
ggml_from_float_t const q_to_vec_dot = type_traits_cpu[k_vec_dot_type].from_float;
|
||||||
ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot;
|
ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot;
|
||||||
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
||||||
|
|
||||||
|
@ -13800,6 +13789,151 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
||||||
return ggml_graph_compute(cgraph, &cplan);
|
return ggml_graph_compute(cgraph, &cplan);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx(void) {
|
||||||
|
#if defined(__AVX__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx_vnni(void) {
|
||||||
|
#if defined(__AVXVNNI__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx2(void) {
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx512(void) {
|
||||||
|
#if defined(__AVX512F__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx512_vbmi(void) {
|
||||||
|
#if defined(__AVX512VBMI__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx512_vnni(void) {
|
||||||
|
#if defined(__AVX512VNNI__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_avx512_bf16(void) {
|
||||||
|
#if defined(__AVX512BF16__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_amx_int8(void) {
|
||||||
|
#if defined(__AMX_INT8__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_fma(void) {
|
||||||
|
#if defined(__FMA__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_arm_fma(void) {
|
||||||
|
#if defined(__ARM_FEATURE_FMA)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_riscv_v(void) {
|
||||||
|
#if defined(__riscv_v_intrinsic)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_f16c(void) {
|
||||||
|
#if defined(__F16C__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC defined, 0 otherwise.
int ggml_cpu_has_fp16_va(void) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with __wasm_simd128__ defined, 0 otherwise.
int ggml_cpu_has_wasm_simd(void) {
#ifdef __wasm_simd128__
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with GGML_USE_LLAMAFILE defined, 0 otherwise.
int ggml_cpu_has_llamafile(void) {
#ifdef GGML_USE_LLAMAFILE
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with __SSE3__ defined, 0 otherwise.
int ggml_cpu_has_sse3(void) {
#ifdef __SSE3__
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with __SSSE3__ defined, 0 otherwise.
int ggml_cpu_has_ssse3(void) {
#ifdef __SSSE3__
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
|
// Returns 1 when this file was compiled with __POWER9_VECTOR__ defined, 0 otherwise.
int ggml_cpu_has_vsx(void) {
#ifdef __POWER9_VECTOR__
    const int enabled = 1;
#else
    const int enabled = 0;
#endif
    return enabled;
}
|
||||||
|
|
||||||
int ggml_cpu_has_neon(void) {
|
int ggml_cpu_has_neon(void) {
|
||||||
#if defined(__ARM_ARCH)
|
#if defined(__ARM_ARCH)
|
||||||
return ggml_arm_arch_features.has_neon;
|
return ggml_arm_arch_features.has_neon;
|
575
ggml/src/ggml-cpu/ggml-cpu.cpp
Normal file
575
ggml/src/ggml-cpu/ggml-cpu.cpp
Normal file
|
@ -0,0 +1,575 @@
|
||||||
|
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"

#include <cctype>
#include <cstdio>
#include <cstring>
#include <new>
#include <string>
#include <vector>
|
||||||
|
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ggml-backend interface
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
|
||||||
|
// buffer type HBM
|
||||||
|
|
||||||
|
#include <hbwmalloc.h>
|
||||||
|
|
||||||
|
// Name reported for the HBM buffer type; the buffer type argument is not consulted.
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "CPU_HBM";
}
|
||||||
|
|
||||||
|
// Releases the memory referenced by buffer->context via hbw_free.
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * hbm_ptr = buffer->context;
    hbw_free(hbm_ptr);
}
|
||||||
|
|
||||||
|
// Allocates `size` bytes with hbw_posix_memalign and wraps them in a CPU buffer.
// Returns NULL (after logging) when the allocation fails.
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * hbm_ptr;
    const size_t alignment = ggml_backend_cpu_buffer_type_get_alignment(buft);

    if (hbw_posix_memalign(&hbm_ptr, alignment, size) != 0) {
        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
        return NULL;
    }

    // start from a regular CPU buffer, then override its type and its free routine
    // so the memory is handed back through hbw_free
    ggml_backend_buffer_t wrapped = ggml_backend_cpu_buffer_from_ptr(hbm_ptr, size);
    wrapped->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
    wrapped->buft              = buft;

    return wrapped;
}
|
||||||
|
|
||||||
|
// Returns the HBM buffer type descriptor (a function-local static, so the same
// pointer is returned on every call).
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
    static struct ggml_backend_buffer_type hbm_buft = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };

    return &hbm_buft;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Returns a NULL-terminated array of additional buffer types for the CPU device.
// The HBM buffer type is listed only when GGML_USE_CPU_HBM is defined.
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
    GGML_UNUSED(device);

    static ggml_backend_buffer_type_t extra_bufts[] = {
#ifdef GGML_USE_CPU_HBM
        ggml_backend_cpu_hbm_buffer_type(),
#endif
        NULL
    };

    return extra_bufts;
}
|
||||||
|
|
||||||
|
// CPU backend - backend (stream)
|
||||||
|
|
||||||
|
struct ggml_backend_cpu_context {
|
||||||
|
int n_threads;
|
||||||
|
ggml_threadpool_t threadpool;
|
||||||
|
|
||||||
|
uint8_t * work_data;
|
||||||
|
size_t work_size;
|
||||||
|
|
||||||
|
ggml_abort_callback abort_callback;
|
||||||
|
void * abort_callback_data;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Backend name; identical for every CPU backend instance.
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return "CPU";
}
|
||||||
|
|
||||||
|
// Destroys a CPU backend: its scratch buffer, its context, then the backend object itself.
static void ggml_backend_cpu_free(ggml_backend_t backend) {
    ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context *) backend->context;

    delete[] ctx->work_data;
    delete ctx;

    delete backend;
}
|
||||||
|
|
||||||
|
struct ggml_backend_plan_cpu {
|
||||||
|
struct ggml_cplan cplan;
|
||||||
|
struct ggml_cgraph cgraph;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Builds a reusable compute plan for `cgraph` using the backend's thread settings.
//
// Returns the new plan, or NULL when memory for the plan or its work buffer cannot
// be allocated. Allocations use the non-throwing form of `new` so that failure is
// reported through the NULL return instead of an exception.
// The plan must be released with ggml_backend_cpu_graph_plan_free.
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = new (std::nothrow) ggml_backend_plan_cpu;
    if (cpu_plan == NULL) {
        return NULL;
    }

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = new (std::nothrow) uint8_t[cpu_plan->cplan.work_size];
        if (cpu_plan->cplan.work_data == NULL) {
            // do not leak the plan when the work buffer cannot be allocated
            delete cpu_plan;
            return NULL;
        }
    }

    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return cpu_plan;
}
|
||||||
|
|
||||||
|
// Releases a plan created by ggml_backend_cpu_graph_plan_create, including its work buffer.
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    ggml_backend_plan_cpu * owned = (ggml_backend_plan_cpu *) plan;

    delete[] owned->cplan.work_data;
    delete owned;
}
|
||||||
|
|
||||||
|
// Runs a previously created plan on the graph copy stored inside it.
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_UNUSED(backend);

    ggml_backend_plan_cpu * prepared = (ggml_backend_plan_cpu *) plan;

    struct ggml_cgraph * graph = &prepared->cgraph;
    struct ggml_cplan  * cplan = &prepared->cplan;

    return ggml_graph_compute(graph, cplan);
}
|
||||||
|
|
||||||
|
// Plans and computes `cgraph` in one step, reusing the backend's scratch buffer.
//
// The scratch buffer is grown when the plan needs more space than is currently
// available. Returns GGML_STATUS_ALLOC_FAILED when the buffer cannot be grown;
// otherwise returns the status of ggml_graph_compute.
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

    if (cpu_ctx->work_size < cplan.work_size) {
        delete[] cpu_ctx->work_data;
        // clear the bookkeeping before reallocating so the context never holds a
        // dangling pointer (ggml_backend_cpu_free would otherwise free it twice)
        cpu_ctx->work_data = NULL;
        cpu_ctx->work_size = 0;

        // non-throwing allocation: failure is reported through the status code
        cpu_ctx->work_data = new (std::nothrow) uint8_t[cplan.work_size];
        if (cpu_ctx->work_data == NULL) {
            return GGML_STATUS_ALLOC_FAILED;
        }
        cpu_ctx->work_size = cplan.work_size;
    }
    cplan.work_data = (uint8_t *)cpu_ctx->work_data;

    cplan.abort_callback      = cpu_ctx->abort_callback;
    cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return ggml_graph_compute(cgraph, &cplan);
}
|
||||||
|
|
||||||
|
// Function table of the CPU backend. Entries left NULL are not provided by this backend.
static const struct ggml_backend_i ggml_backend_cpu_i = {
    /* .get_name           = */ ggml_backend_cpu_get_name,
    /* .free               = */ ggml_backend_cpu_free,
    // asynchronous transfers and synchronization
    /* .set_tensor_async   = */ NULL,
    /* .get_tensor_async   = */ NULL,
    /* .cpy_tensor_async   = */ NULL,
    /* .synchronize        = */ NULL,
    // graph plans and graph execution
    /* .graph_plan_create  = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free    = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_update  = */ NULL,
    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute      = */ ggml_backend_cpu_graph_compute,
    // events
    /* .event_record       = */ NULL,
    /* .event_wait         = */ NULL,
};
|
||||||
|
|
||||||
|
// Returns a pointer to the fixed GUID that identifies the CPU backend.
static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid cpu_guid = {
        0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a,
        0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89,
    };
    return &cpu_guid;
}
|
||||||
|
|
||||||
|
// Creates a CPU backend with default settings: GGML_DEFAULT_N_THREADS threads,
// no threadpool, no scratch buffer and no abort callback.
//
// Returns NULL when memory for the backend cannot be allocated. Allocations use
// the non-throwing form of `new` so that failure is reported through the NULL
// return and the partially built state is released.
ggml_backend_t ggml_backend_cpu_init(void) {
    // initialize CPU backend now to avoid slowing the first graph computation
    ggml_cpu_init();

    struct ggml_backend_cpu_context * ctx = new (std::nothrow) ggml_backend_cpu_context;
    if (ctx == NULL) {
        return NULL;
    }

    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
    ctx->threadpool          = NULL;
    ctx->work_data           = NULL;
    ctx->work_size           = 0;
    ctx->abort_callback      = NULL;
    ctx->abort_callback_data = NULL;

    ggml_backend_t cpu_backend = new (std::nothrow) ggml_backend {
        /* .guid      = */ ggml_backend_cpu_guid(),
        /* .interface = */ ggml_backend_cpu_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context   = */ ctx,
    };

    if (cpu_backend == NULL) {
        // the backend never took ownership of the context
        delete ctx;
        return NULL;
    }

    return cpu_backend;
}
|
||||||
|
|
||||||
|
// True when `backend` is non-NULL and carries the CPU backend GUID.
bool ggml_backend_is_cpu(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }
    return ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}
|
||||||
|
|
||||||
|
// Sets the thread count used when planning graphs on this backend.
// Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_ctx = (ggml_backend_cpu_context *) backend_cpu->context;
    cpu_ctx->n_threads = n_threads;
}
|
||||||
|
|
||||||
|
// Installs `threadpool` on this backend. Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_ctx = (ggml_backend_cpu_context *) backend_cpu->context;

    ggml_threadpool_t previous = cpu_ctx->threadpool;
    if (previous != NULL && previous != threadpool) {
        // already had a different threadpool, pause/suspend it before switching
        ggml_threadpool_pause(previous);
    }

    cpu_ctx->threadpool = threadpool;
}
|
||||||
|
|
||||||
|
// Stores the abort callback and its user data; both are copied into later compute plans.
// Asserts that `backend_cpu` is a CPU backend.
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    ggml_backend_cpu_context * cpu_ctx = (ggml_backend_cpu_context *) backend_cpu->context;

    cpu_ctx->abort_callback_data = abort_callback_data;
    cpu_ctx->abort_callback      = abort_callback;
}
|
||||||
|
|
||||||
|
// CPU backend - device
|
||||||
|
|
||||||
|
// Device-level state of the CPU backend: a human-readable CPU description.
//
// The constructor tries to obtain the CPU brand string from the operating system
// (sysctl on Apple, /proc/cpuinfo on Linux, the registry on Windows). Whenever the
// lookup fails or yields an empty name, `description` keeps its default "CPU".
struct ggml_backend_cpu_device_context {
    std::string description = "CPU";

    ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0) && len > 0) {
            std::string brand(len, '\0');
            if (!sysctlbyname("machdep.cpu.brand_string", &brand[0], &len, NULL, 0)) { // NOLINT
                // assigning from c_str() stops at the first NUL, so a terminator
                // written into the buffer does not end up inside the description
                if (brand[0] != '\0') {
                    description = brand.c_str();
                }
            }
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                if (strncmp(buf, "model name", 10) != 0) {
                    continue;
                }
                const char * p = strchr(buf, ':');
                if (!p) {
                    continue;
                }
                p++;
                // std::isspace requires a value representable as unsigned char
                while (std::isspace((unsigned char) *p)) {
                    p++;
                }
                // trim trailing whitespace (including the newline kept by fgets)
                // without indexing before the start of an empty value
                size_t n = strlen(p);
                while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
                    n--;
                }
                if (n > 0) {
                    description.assign(p, n);
                }
                break;
            }
            fclose(f);
        }
#elif defined(_WIN32)
        // use the ANSI ("A") registry functions with narrow literals throughout:
        // TEXT() expands to a wide literal when UNICODE is defined, which does not
        // match the char-based RegQueryValueExA
        HKEY hKey;
        if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
                          "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                          0,
                          KEY_READ,
                          &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                 "ProcessorNameString",
                                 NULL,
                                 NULL,
                                 NULL,
                                 &cpu_brand_size) == ERROR_SUCCESS && cpu_brand_size > 0) {
                std::string brand(cpu_brand_size, '\0');
                if (RegQueryValueExA(hKey,
                                     "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     (LPBYTE)&brand[0], // NOLINT
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    // keep only the text before the first NUL
                    if (brand[0] != '\0') {
                        description = brand.c_str();
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};
|
||||||
|
|
||||||
|
// Device name; always "CPU".
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return "CPU";
}
|
||||||
|
|
||||||
|
// Returns the description string stored in the device context.
// The pointer refers to storage owned by that context.
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
    const ggml_backend_cpu_device_context * dev_ctx = (ggml_backend_cpu_device_context *) dev->context;
    const std::string & text = dev_ctx->description;

    return text.c_str();
}
|
||||||
|
|
||||||
|
// Memory query for the CPU device. Not implemented yet: both values are reported as 0.
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);

    // TODO
    const size_t unknown = 0;
    *free  = unknown;
    *total = unknown;
}
|
||||||
|
|
||||||
|
// Device type; always GGML_BACKEND_DEVICE_TYPE_CPU.
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return GGML_BACKEND_DEVICE_TYPE_CPU;
}
|
||||||
|
|
||||||
|
// Fills `props` from the individual device getters and the fixed CPU capability flags.
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);

    props->type        = ggml_backend_cpu_device_get_type(dev);
    props->description = ggml_backend_cpu_device_get_description(dev);
    props->name        = ggml_backend_cpu_device_get_name(dev);

    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ true,
        /* .events                = */ false,
    };
}
|
||||||
|
|
||||||
|
// Creates a CPU backend through ggml_backend_cpu_init; `dev` and `params` are ignored.
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(dev);
    GGML_UNUSED(params);

    ggml_backend_t created = ggml_backend_cpu_init();
    return created;
}
|
||||||
|
|
||||||
|
// Returns the buffer type obtained from ggml_backend_cpu_buffer_type.
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
    return cpu_buft;
}
|
||||||
|
|
||||||
|
// Wraps caller-provided host memory in a CPU buffer; `max_tensor_size` is ignored.
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    GGML_UNUSED(dev);
    GGML_UNUSED(max_tensor_size);

    ggml_backend_buffer_t wrapped = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    return wrapped;
}
|
||||||
|
|
||||||
|
// Reports whether the CPU backend can execute `op`.
// Every operation is accepted except for the type restrictions listed per case below.
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    GGML_UNUSED(dev);

    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        case GGML_OP_CPY: {
            // these destination types are rejected: missing type_traits.from_float
            switch (op->type) {
                case GGML_TYPE_IQ2_XXS:
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ1_S:
                case GGML_TYPE_IQ1_M:
                    return false;
                default:
                    return true;
            }
        }
        case GGML_OP_MUL_MAT: {
            // FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
            return src1->type == GGML_TYPE_F32;
        }
        case GGML_OP_ROPE_BACK: {
            if (op->src[2] != NULL) {
                return false;
            }
            return (op->op_params[2] & 4) == 0;
        }
        case GGML_OP_IM2COL_BACK: {
            if (src0->type != GGML_TYPE_F32) {
                return false;
            }
            return src1->type == GGML_TYPE_F32;
        }
        case GGML_OP_OUT_PROD: {
            const bool src0_ok = src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type);
            return src0_ok && src1->type == GGML_TYPE_F32;
        }
        default:
            return true;
    }
}
|
||||||
|
|
||||||
|
// A buffer type is supported exactly when ggml_backend_buft_is_host reports it as a host buffer type.
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(dev);

    const bool is_host = ggml_backend_buft_is_host(buft);
    return is_host;
}
|
||||||
|
|
||||||
|
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||||
|
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
||||||
|
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
||||||
|
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||||
|
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||||
|
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||||
|
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||||
|
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||||
|
/* .get_host_buffer_type = */ NULL,
|
||||||
|
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||||
|
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||||
|
/* .offload_op = */ NULL,
|
||||||
|
/* .event_new = */ NULL,
|
||||||
|
/* .event_free = */ NULL,
|
||||||
|
/* .event_synchronize = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
// CPU backend - backend (reg)
|
||||||
|
|
||||||
|
// Registry name; always "CPU".
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    return "CPU";
}
|
||||||
|
|
||||||
|
// The CPU registry exposes exactly one device.
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    const size_t device_count = 1;
    return device_count;
}
|
||||||
|
|
||||||
|
// Returns the single CPU device; asserts that `index` is 0.
// The device and its context are function-local statics, created on the first call.
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_cpu_device_context device_ctx;
    static ggml_backend_device cpu_device = {
        /* .iface   = */ ggml_backend_cpu_device_i,
        /* .reg     = */ reg,
        /* .context = */ &device_ctx,
    };

    return &cpu_device;
}
|
||||||
|
|
||||||
|
// One entry of a backend feature list: a feature name and its value, both C strings.
struct ggml_backend_feature {
    const char * name;
    const char * value;
};
|
||||||
|
|
||||||
|
// Not used yet
|
||||||
|
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
|
||||||
|
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
|
||||||
|
// Returns the list of CPU features reported by the ggml_cpu_has_* probes.
// The list is built once on the first call and ends with a { nullptr, nullptr } entry.
// Every present feature has the value "1", except SVE_CNT, which carries the
// result of ggml_cpu_get_sve_cnt() as a decimal string.
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    static std::vector<ggml_backend_feature> feature_list = []() {
        std::vector<ggml_backend_feature> list;

        // appends { name, "1" } when the probe result is non-zero
        const auto add_flag = [&list](int present, const char * name) {
            if (present) {
                list.push_back({ name, "1" });
            }
        };

        add_flag(ggml_cpu_has_sse3(),        "SSE3");
        add_flag(ggml_cpu_has_ssse3(),       "SSSE3");
        add_flag(ggml_cpu_has_avx(),         "AVX");
        add_flag(ggml_cpu_has_avx2(),        "AVX2");
        add_flag(ggml_cpu_has_f16c(),        "F16C");
        add_flag(ggml_cpu_has_fma(),         "FMA");
        add_flag(ggml_cpu_has_avx_vnni(),    "AVX_VNNI");
        add_flag(ggml_cpu_has_avx512(),      "AVX512");
        add_flag(ggml_cpu_has_avx512_vbmi(), "AVX512_VBMI");
        add_flag(ggml_cpu_has_avx512_vnni(), "AVX512_VNNI");
        add_flag(ggml_cpu_has_avx512_bf16(), "AVX512_BF16");
        add_flag(ggml_cpu_has_amx_int8(),    "AMX_INT8");
        add_flag(ggml_cpu_has_neon(),        "NEON");
        add_flag(ggml_cpu_has_arm_fma(),     "ARM_FMA");
        add_flag(ggml_cpu_has_fp16_va(),     "FP16_VA");
        add_flag(ggml_cpu_has_matmul_int8(), "MATMUL_INT8");
        add_flag(ggml_cpu_has_sve(),         "SVE");

        if (ggml_cpu_get_sve_cnt() > 0) {
            // static so the string outlives the lambda; the list stores only its c_str()
            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
            list.push_back({ "SVE_CNT", sve_cnt.c_str() });
        }

        add_flag(ggml_cpu_has_riscv_v(),     "RISCV_V");
        add_flag(ggml_cpu_has_vsx(),         "VSX");
        add_flag(ggml_cpu_has_wasm_simd(),   "WASM_SIMD");
        add_flag(ggml_cpu_has_llamafile(),   "LLAMAFILE");

        // terminator entry
        list.push_back({ nullptr, nullptr });

        return list;
    }();

    return feature_list.data();
}
|
||||||
|
|
||||||
|
// Looks up an optional backend entry point by name.
// Recognizes "ggml_backend_set_n_threads" and "ggml_backend_dev_get_extra_bufts";
// any other name yields NULL.
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);

    void * address = NULL;

    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
        address = (void *)ggml_backend_cpu_set_n_threads;
    } else if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        address = (void *)ggml_backend_cpu_get_extra_bufts;
    }

    return address;
}
|
||||||
|
|
||||||
|
// Function table of the CPU backend registry.
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
|
||||||
|
|
||||||
|
// Returns the CPU backend registry (a function-local static, so the same pointer
// is returned on every call).
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
    static struct ggml_backend_reg cpu_reg = {
        /* .iface   = */ ggml_backend_cpu_reg_i,
        /* .context = */ NULL,
    };

    return &cpu_reg;
}
|
|
@ -6,7 +6,7 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIP)
|
||||||
#define GGML_COMMON_DECL_HIP
|
#define GGML_COMMON_DECL_HIP
|
||||||
#define GGML_COMMON_IMPL_HIP
|
#define GGML_COMMON_IMPL_HIP
|
||||||
#else
|
#else
|
||||||
|
@ -26,13 +26,13 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIP)
|
||||||
#include "vendors/hip.h"
|
#include "vendors/hip.h"
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
#include "vendors/musa.h"
|
#include "vendors/musa.h"
|
||||||
#else
|
#else
|
||||||
#include "vendors/cuda.h"
|
#include "vendors/cuda.h"
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIP)
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||||
|
@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||||
|
|
||||||
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIP)
|
||||||
static const char * cu_get_error_str(CUresult err) {
|
static const char * cu_get_error_str(CUresult err) {
|
||||||
const char * err_str;
|
const char * err_str;
|
||||||
cuGetErrorString(err, &err_str);
|
cuGetErrorString(err, &err_str);
|
||||||
|
@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif // GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
|
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#define FP16_AVAILABLE
|
#define FP16_AVAILABLE
|
||||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
|
|
||||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||||
#define FAST_FP16_AVAILABLE
|
#define FAST_FP16_AVAILABLE
|
||||||
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#define FP16_MMA_AVAILABLE
|
#define FP16_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||||
#define INT8_MMA_AVAILABLE
|
#define INT8_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||||
|
|
||||||
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
||||||
#define FLASH_ATTN_AVAILABLE
|
#define FLASH_ATTN_AVAILABLE
|
||||||
|
@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
|
||||||
static __device__ void no_device_code(
|
static __device__ void no_device_code(
|
||||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||||
file_name, line, function_name, arch);
|
file_name, line, function_name, arch);
|
||||||
GGML_UNUSED(arch_list);
|
GGML_UNUSED(arch_list);
|
||||||
#else
|
#else
|
||||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||||
file_name, line, function_name, arch, arch_list);
|
file_name, line, function_name, arch, arch_list);
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
__trap();
|
__trap();
|
||||||
|
|
||||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||||
|
@ -176,7 +176,7 @@ static __device__ void no_device_code(
|
||||||
#endif // __CUDA_ARCH__
|
#endif // __CUDA_ARCH__
|
||||||
|
|
||||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||||
return __reduce_add_sync(0xffffffff, x);
|
return __reduce_add_sync(0xffffffff, x);
|
||||||
#else
|
#else
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||||
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
||||||
}
|
}
|
||||||
return x;
|
return x;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||||
|
@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
|
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
|
||||||
|
@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||||
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
||||||
}
|
}
|
||||||
return a;
|
return a;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
|
@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||||
#else
|
#else
|
||||||
return __hmax(a, b);
|
return __hmax(a, b);
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||||
|
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
|
@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
|
||||||
#if CUDART_VERSION >= CUDART_HMAX
|
#if CUDART_VERSION >= CUDART_HMAX
|
||||||
return __hmax2(a, b);
|
return __hmax2(a, b);
|
||||||
|
@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
|
||||||
GGML_UNUSED(a);
|
GGML_UNUSED(a);
|
||||||
GGML_UNUSED(b);
|
GGML_UNUSED(b);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||||
|
@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED(x);
|
GGML_UNUSED(x);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||||
}
|
}
|
||||||
|
|
||||||
#if CUDART_VERSION < CUDART_HMASK
|
#if CUDART_VERSION < CUDART_HMASK
|
||||||
|
@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||||
#endif // CUDART_VERSION < CUDART_HMASK
|
#endif // CUDART_VERSION < CUDART_HMASK
|
||||||
|
|
||||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||||
#elif defined(RDNA3)
|
#elif defined(RDNA3)
|
||||||
|
@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||||
#endif
|
#endif
|
||||||
return c;
|
return c;
|
||||||
|
|
||||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
return __dp4a(a, b, c);
|
return __dp4a(a, b, c);
|
||||||
|
@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: move to ggml-common.h
|
// TODO: move to ggml-common.h
|
||||||
|
|
|
@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int parallel_blocks> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_combine_results(
|
static __global__ void flash_attn_combine_results(
|
||||||
const float * __restrict__ VKQ_parts,
|
const float * __restrict__ VKQ_parts,
|
||||||
const float2 * __restrict__ VKQ_meta,
|
const float2 * __restrict__ VKQ_meta,
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||||
|
|
||||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_tile_ext_f16(
|
static __global__ void flash_attn_tile_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||||
|
|
||||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_tile_ext_f32(
|
static __global__ void flash_attn_tile_ext_f32(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
|
|
@ -2,9 +2,9 @@
|
||||||
#include "fattn-common.cuh"
|
#include "fattn-common.cuh"
|
||||||
|
|
||||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_vec_ext_f16(
|
static __global__ void flash_attn_vec_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
|
|
@ -2,9 +2,9 @@
|
||||||
#include "fattn-common.cuh"
|
#include "fattn-common.cuh"
|
||||||
|
|
||||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_vec_ext_f32(
|
static __global__ void flash_attn_vec_ext_f32(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
|
|
@ -7,9 +7,9 @@
|
||||||
|
|
||||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_ext_f16(
|
static __global__ void flash_attn_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
|
|
@ -93,7 +93,7 @@ int ggml_cuda_get_device() {
|
||||||
|
|
||||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
|
||||||
auto res = hipMallocManaged(ptr, size);
|
auto res = hipMallocManaged(ptr, size);
|
||||||
if (res == hipSuccess) {
|
if (res == hipSuccess) {
|
||||||
// if error we "need" to know why...
|
// if error we "need" to know why...
|
||||||
|
@ -102,7 +102,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||||
return res;
|
return res;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIP)
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||||
{
|
{
|
||||||
|
@ -115,7 +115,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||||
return err;
|
return err;
|
||||||
#else
|
#else
|
||||||
return cudaMalloc(ptr, size);
|
return cudaMalloc(ptr, size);
|
||||||
#endif // !defined(GGML_USE_HIPBLAS)
|
#endif // !defined(GGML_USE_HIP)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -154,7 +154,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
CU_CHECK(cuDeviceGet(&device, id));
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
@ -166,7 +166,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
alloc_prop.location.id = id;
|
alloc_prop.location.id = id;
|
||||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
info.devices[id].vmm = !!device_vmm;
|
info.devices[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
|
@ -178,13 +178,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
|
|
||||||
info.devices[id].nsm = prop.multiProcessorCount;
|
info.devices[id].nsm = prop.multiProcessorCount;
|
||||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||||
#else
|
#else
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
|
@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||||
};
|
};
|
||||||
|
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||||
|
|
||||||
|
@ -394,14 +394,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
|
|
||||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
if (ggml_cuda_info().devices[device].vmm) {
|
if (ggml_cuda_info().devices[device].vmm) {
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1326,7 +1326,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||||
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||||
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||||
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
||||||
cudaMemcpy3DPeerParms p = {};
|
cudaMemcpy3DPeerParms p = {};
|
||||||
p.dstDevice = dstDevice;
|
p.dstDevice = dstDevice;
|
||||||
|
@ -1340,7 +1340,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||||
GGML_UNUSED(dstDevice);
|
GGML_UNUSED(dstDevice);
|
||||||
GGML_UNUSED(srcDevice);
|
GGML_UNUSED(srcDevice);
|
||||||
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_op_mul_mat(
|
static void ggml_cuda_op_mul_mat(
|
||||||
|
@ -2983,6 +2983,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
{
|
{
|
||||||
struct ggml_tensor * a = op->src[0];
|
struct ggml_tensor * a = op->src[0];
|
||||||
struct ggml_tensor * b = op->src[1];
|
struct ggml_tensor * b = op->src[1];
|
||||||
|
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
|
||||||
|
// this avoids some edge cases (and the performance would not be good anyways)
|
||||||
|
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
|
||||||
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
|
||||||
|
int64_t row_low;
|
||||||
|
int64_t row_high;
|
||||||
|
get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
|
||||||
|
if (row_low == row_high) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
|
@ -101,9 +101,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||||
return 128;
|
return 128;
|
||||||
#else // INT8_MMA_AVAILABLE
|
#else // INT8_MMA_AVAILABLE
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
return 128;
|
return 128;
|
||||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#ifdef GGML_CUDA_FORCE_MMQ
|
#ifdef GGML_CUDA_FORCE_MMQ
|
||||||
|
@ -116,7 +116,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||||
return 64;
|
return 64;
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#endif // INT8_MMA_AVAILABLE
|
#endif // INT8_MMA_AVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -125,7 +125,7 @@ static constexpr int get_mmq_y_host(const int cc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr __device__ int get_mmq_y_device() {
|
static constexpr __device__ int get_mmq_y_device() {
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(RDNA1)
|
#if defined(RDNA1)
|
||||||
return 64;
|
return 64;
|
||||||
#else
|
#else
|
||||||
|
@ -137,7 +137,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||||
#else
|
#else
|
||||||
return 64;
|
return 64;
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
||||||
|
@ -2570,7 +2570,7 @@ static __device__ void mul_mat_q_process_tile(
|
||||||
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
||||||
|
|
||||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(RDNA3) || defined(RDNA2)
|
#if defined(RDNA3) || defined(RDNA2)
|
||||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||||
#endif // defined(RDNA3) || defined(RDNA2)
|
#endif // defined(RDNA3) || defined(RDNA2)
|
||||||
|
@ -2580,7 +2580,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||||
#else
|
#else
|
||||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
static __global__ void mul_mat_q(
|
static __global__ void mul_mat_q(
|
||||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||||
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
||||||
|
@ -2595,7 +2595,7 @@ static __global__ void mul_mat_q(
|
||||||
constexpr int mmq_y = get_mmq_y_device();
|
constexpr int mmq_y = get_mmq_y_device();
|
||||||
|
|
||||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||||
{
|
{
|
||||||
constexpr bool fixup = false;
|
constexpr bool fixup = false;
|
||||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||||
|
@ -2603,7 +2603,7 @@ static __global__ void mul_mat_q(
|
||||||
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||||
|
|
||||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||||
|
@ -2766,14 +2766,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||||
|
|
||||||
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
|
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||||
if (!shmem_limit_raised[id]) {
|
if (!shmem_limit_raised[id]) {
|
||||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||||
shmem_limit_raised[id] = true;
|
shmem_limit_raised[id] = true;
|
||||||
}
|
}
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
|
||||||
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||||
|
|
|
@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <ggml_type type, int ncols_y>
|
template <ggml_type type, int ncols_y>
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||||
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void mul_mat_vec_q(
|
static __global__ void mul_mat_vec_q(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||||
|
@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
|
||||||
|
|
||||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
constexpr int nwarps = 1;
|
constexpr int nwarps = 1;
|
||||||
constexpr int rows_per_cuda_block = 1;
|
constexpr int rows_per_cuda_block = 1;
|
||||||
#else
|
#else
|
||||||
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||||
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||||
|
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||||
#define USE_CUB
|
#define USE_CUB
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||||
|
|
||||||
#ifdef USE_CUB
|
#ifdef USE_CUB
|
||||||
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
||||||
|
|
|
@ -3,13 +3,29 @@
|
||||||
// GGML internal header
|
// GGML internal header
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <math.h>
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef __ARM_FEATURE_SVE
|
||||||
|
#include <arm_sve.h>
|
||||||
|
#endif // __ARM_FEATURE_SVE
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON)
|
||||||
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||||
|
//
|
||||||
|
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||||
|
//
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__F16C__)
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -120,14 +136,12 @@ struct ggml_map_custom1_op_params {
|
||||||
void * userdata;
|
void * userdata;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct ggml_map_custom2_op_params {
|
struct ggml_map_custom2_op_params {
|
||||||
ggml_custom2_op_t fun;
|
ggml_custom2_op_t fun;
|
||||||
int n_tasks;
|
int n_tasks;
|
||||||
void * userdata;
|
void * userdata;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct ggml_map_custom3_op_params {
|
struct ggml_map_custom3_op_params {
|
||||||
ggml_custom3_op_t fun;
|
ggml_custom3_op_t fun;
|
||||||
int n_tasks;
|
int n_tasks;
|
||||||
|
@ -287,9 +301,249 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||||
void * ggml_aligned_malloc(size_t size);
|
void * ggml_aligned_malloc(size_t size);
|
||||||
void ggml_aligned_free(void * ptr, size_t size);
|
void ggml_aligned_free(void * ptr, size_t size);
|
||||||
|
|
||||||
// TODO: move to threading file
|
// FP16 to FP32 conversion
|
||||||
void ggml_critical_section_start(void);
|
|
||||||
void ggml_critical_section_end(void);
|
#if defined(__ARM_NEON)
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
typedef uint16_t ggml_fp16_internal_t;
|
||||||
|
#else
|
||||||
|
typedef __fp16 ggml_fp16_internal_t;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||||
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||||
|
|
||||||
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
|
|
||||||
|
// Converts a half-precision value to float (ARM NEON, non-MSVC branch).
// The bits of h are copied into the __fp16-typed temporary, then widened with a cast.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||||
|
ggml_fp16_internal_t tmp;
|
||||||
|
// memcpy copies the raw bits of h; no numeric conversion happens on this line
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||||
|
// the cast performs the actual half -> float conversion
return (float)tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts a float to half precision (ARM NEON, non-MSVC branch).
// The value is narrowed by assignment to the __fp16-typed temporary and its bits
// are then copied into the ggml_fp16_t result.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||||
|
ggml_fp16_t res;
|
||||||
|
// the initialization performs the actual float -> half conversion
ggml_fp16_internal_t tmp = f;
|
||||||
|
// memcpy copies the raw bits of tmp; no numeric conversion happens on this line
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__F16C__)
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||||
|
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||||
|
#else
|
||||||
|
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||||
|
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#elif defined(__POWER9_VECTOR__)
|
||||||
|
|
||||||
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||||
|
/* the inline asm below is about 12% faster than the lookup method */
|
||||||
|
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||||
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||||
|
|
||||||
|
// POWER9 branch: half -> float conversion implemented with inline assembly.
// Asm operands: %0 = d (scratch, constraint "=d"), %1 = f (result, constraint "=f"),
// %2 = h (input, constraint "r").
// NOTE(review): presumably mtfprd moves the general register into the FP register,
// xscvhpdp converts half -> double and frsp rounds to single precision -- confirm
// against the Power ISA.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||||
|
register float f;
|
||||||
|
register double d;
|
||||||
|
__asm__(
|
||||||
|
"mtfprd %0,%2\n"
|
||||||
|
"xscvhpdp %0,%0\n"
|
||||||
|
"frsp %1,%0\n" :
|
||||||
|
/* temp */ "=d"(d),
|
||||||
|
/* out */ "=f"(f):
|
||||||
|
/* in */ "r"(h));
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// POWER9 branch: float -> half conversion implemented with inline assembly.
// Asm operands: %0 = d (scratch, constraint "=d"), %1 = r (result, constraint "=r"),
// %2 = f (input, constraint "f").
// NOTE(review): presumably xscvdphp converts to half precision inside the FP register
// and mffprd moves the result to the general register -- confirm against the Power ISA.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||||
|
register double d;
|
||||||
|
register ggml_fp16_t r;
|
||||||
|
__asm__( /* xscvdphp can work on double or single precision */
|
||||||
|
"xscvdphp %0,%2\n"
|
||||||
|
"mffprd %1,%0\n" :
|
||||||
|
/* temp */ "=d"(d),
|
||||||
|
/* out */ "=r"(r):
|
||||||
|
/* in */ "f"(f));
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// FP16 <-> FP32
|
||||||
|
// ref: https://github.com/Maratyszcza/FP16
|
||||||
|
|
||||||
|
// Reinterprets the 32-bit pattern w as a float.
// The bits are written to one union member and read back through the other,
// so no numeric conversion takes place.
static inline float fp32_from_bits(uint32_t w) {
|
||||||
|
union {
|
||||||
|
uint32_t as_bits;
|
||||||
|
float as_value;
|
||||||
|
} fp32;
|
||||||
|
fp32.as_bits = w;
|
||||||
|
return fp32.as_value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the 32-bit pattern that represents the float f (inverse of fp32_from_bits).
// The value is written to one union member and read back through the other,
// so no numeric conversion takes place.
static inline uint32_t fp32_to_bits(float f) {
|
||||||
|
union {
|
||||||
|
float as_value;
|
||||||
|
uint32_t as_bits;
|
||||||
|
} fp32;
|
||||||
|
fp32.as_value = f;
|
||||||
|
return fp32.as_bits;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Portable half -> float conversion using only integer and float arithmetic
// (fallback branch, used when none of the hardware-specific branches above applies).
// Two candidate results are computed, one valid for normalized inputs (and inf/NaN)
// and one valid for zero/subnormal inputs; the exponent field selects between them.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||||
|
// move the 16 half bits to the top of a 32-bit word so the sign bit sits at bit 31
const uint32_t w = (uint32_t) h << 16;
|
||||||
|
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||||
|
// w + w is a left shift by one: the sign bit drops out, the exponent starts at bit 31
const uint32_t two_w = w + w;
|
||||||
|
|
||||||
|
// normalized path: (two_w >> 4) places the 5 exponent bits at bit 23 and the mantissa
// below them; adding 0xE0 to the exponent field and scaling by 2^-112 re-biases the
// exponent by a net +112 (= 127 - 15), while an all-ones half exponent becomes 0xFF
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||||
|
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
||||||
|
const float exp_scale = 0x1.0p-112f;
|
||||||
|
#else
|
||||||
|
// same value as the hex-float literal above, spelled as raw bits (exponent field 15 -> 2^-112)
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||||
|
#endif
|
||||||
|
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||||
|
|
||||||
|
// subnormal path: with a zero exponent, (two_w >> 17) is the 10-bit mantissa m;
// OR-ing in exponent 126 yields the float 0.5 + m * 2^-24, and subtracting 0.5
// leaves m * 2^-24
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||||
|
const float magic_bias = 0.5f;
|
||||||
|
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||||
|
|
||||||
|
// two_w < 2^27 exactly when the 5 exponent bits (bits 31..27 of two_w) are all zero
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||||
|
const uint32_t result = sign |
|
||||||
|
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||||
|
return fp32_from_bits(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Portable float -> half conversion using only integer and float arithmetic
// (fallback branch, used when none of the hardware-specific branches above applies).
// Returns the half bit pattern; NaN inputs map to the pattern 0x7E00 plus the sign bit.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||||
|
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
||||||
|
const float scale_to_inf = 0x1.0p+112f;
|
||||||
|
const float scale_to_zero = 0x1.0p-110f;
|
||||||
|
#else
|
||||||
|
// same values as the hex-float literals above, spelled as raw bits (2^112 and 2^-110)
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||||
|
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||||
|
#endif
|
||||||
|
// the first multiply overflows to +inf once |f| reaches 2^16; otherwise the two
// multiplies together scale |f| by a net factor of 2^2
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||||
|
|
||||||
|
const uint32_t w = fp32_to_bits(f);
|
||||||
|
// w + w is a left shift by one: the sign bit drops out, the exponent byte is bits 31..24
const uint32_t shl1_w = w + w;
|
||||||
|
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||||
|
// exponent byte of f, clamped from below at 0x71
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||||
|
if (bias < UINT32_C(0x71000000)) {
|
||||||
|
bias = UINT32_C(0x71000000);
|
||||||
|
}
|
||||||
|
|
||||||
|
// (bias >> 1) puts the clamped exponent back into the float exponent field and
// 0x07800000 adds 15 to it; the resulting float is added to base.
// NOTE(review): this looks like the rounding step of the referenced FP16 library -- confirm
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||||
|
const uint32_t bits = fp32_to_bits(base);
|
||||||
|
// shift the float exponent field down to the half exponent position (bits 14..10)
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||||
|
// the mask keeps 12 bits, so bits 10..11 overlap the exponent field and are
// combined with it by the addition below
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||||
|
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||||
|
// shl1_w > 0xFF000000 means exponent all ones with a nonzero mantissa, i.e. NaN
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||||
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||||
|
|
||||||
|
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||||
|
|
||||||
|
// precomputed f32 table for f16 (256 KB)
|
||||||
|
// defined in ggml.c, initialized in ggml_init()
|
||||||
|
GGML_API float ggml_table_f32_f16[1 << 16];
|
||||||
|
|
||||||
|
// On ARM NEON, it's quicker to convert fp16 -> fp32 directly instead of calling into ggml_lookup_fp16_to_fp32,
|
||||||
|
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||||
|
// This is also true for POWER9.
|
||||||
|
#if !defined(GGML_FP16_TO_FP32)
|
||||||
|
// Converts a half-precision value to float by table lookup: the 16 bits of f are
// used as an index into ggml_table_f32_f16.
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||||
|
uint16_t s;
|
||||||
|
// copy the raw bits of f into an integer so they can be used as the table index
memcpy(&s, &f, sizeof(uint16_t));
|
||||||
|
return ggml_table_f32_f16[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(GGML_FP32_TO_FP16)
|
||||||
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts brain16 to float32.
|
||||||
|
*
|
||||||
|
* The bfloat16 floating point format has the following structure:
|
||||||
|
*
|
||||||
|
* ┌sign
|
||||||
|
* │
|
||||||
|
* │ ┌exponent
|
||||||
|
* │ │
|
||||||
|
* │ │ ┌mantissa
|
||||||
|
* │ │ │
|
||||||
|
* │┌──┴───┐┌─┴───┐
|
||||||
|
* 0b0000000000000000 brain16
|
||||||
|
*
|
||||||
|
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||||
|
* encoding and decoding numbers becomes relatively straightforward.
|
||||||
|
*
|
||||||
|
* ┌sign
|
||||||
|
* │
|
||||||
|
* │ ┌exponent
|
||||||
|
* │ │
|
||||||
|
* │ │ ┌mantissa
|
||||||
|
* │ │ │
|
||||||
|
* │┌──┴───┐┌─┴───────────────────┐
|
||||||
|
* 0b00000000000000000000000000000000 IEEE binary32
|
||||||
|
*
|
||||||
|
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||||
|
*
|
||||||
|
* ┌sign
|
||||||
|
* │
|
||||||
|
* │ ┌exponent
|
||||||
|
* │ │
|
||||||
|
* │ │ ┌mantissa
|
||||||
|
* │ │ │
|
||||||
|
* │┌─┴─┐┌─┴──────┐
|
||||||
|
* 0b0000000000000000 IEEE binary16
|
||||||
|
*
|
||||||
|
* @see IEEE 754-2008
|
||||||
|
*/
|
||||||
|
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||||
|
union {
|
||||||
|
float f;
|
||||||
|
uint32_t i;
|
||||||
|
} u;
|
||||||
|
// the 16 bf16 bits become the upper half of the 32-bit pattern; the lower 16 bits are zero
u.i = (uint32_t)h.bits << 16;
|
||||||
|
// read the same storage back as a float (bit reinterpretation, no numeric conversion)
return u.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts float32 to brain16.
|
||||||
|
*
|
||||||
|
* This is binary identical with Google Brain float conversion.
|
||||||
|
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||||
|
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||||
|
* This code should vectorize nicely if using modern compilers.
|
||||||
|
*/
|
||||||
|
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||||
|
ggml_bf16_t h;
|
||||||
|
union {
|
||||||
|
float f;
|
||||||
|
uint32_t i;
|
||||||
|
} u;
|
||||||
|
u.f = s;
|
||||||
|
// with the sign masked off, anything above 0x7f800000 has an all-ones exponent
// and a nonzero mantissa
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||||
|
// keep the upper 16 bits and set bit 6, the top bit of the 7-bit bf16 mantissa
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
// round to nearest, ties to even: add 0x7fff plus the lowest kept bit (bit 16),
// then drop the lower 16 bits
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||||
|
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -516,6 +516,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||||
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if GGML_METAL_EMBED_LIBRARY
|
||||||
|
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||||
|
#endif
|
||||||
|
|
||||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||||
options.preprocessorMacros = prep;
|
options.preprocessorMacros = prep;
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
#define GGML_COMMON_DECL_METAL
|
#define GGML_COMMON_DECL_METAL
|
||||||
#define GGML_COMMON_IMPL_METAL
|
#define GGML_COMMON_IMPL_METAL
|
||||||
#include "ggml-common.h"
|
#if defined(GGML_METAL_EMBED_LIBRARY)
|
||||||
|
__embed_ggml-common.h__
|
||||||
|
#else
|
||||||
|
// TODO: this should not be a relative path, but can't figure out how to set Metal include paths in Package.swift
|
||||||
|
#include "../ggml-common.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <metal_stdlib>
|
#include <metal_stdlib>
|
||||||
|
|
||||||
|
@ -15,8 +20,8 @@ using namespace metal;
|
||||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||||
//
|
//
|
||||||
// cmd:
|
// cmd:
|
||||||
// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal.metal
|
// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal/ggml-metal.metal
|
||||||
// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal
|
// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/ggml-metal.metal
|
||||||
//
|
//
|
||||||
#if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16)
|
#if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16)
|
||||||
#undef GGML_METAL_USE_BF16
|
#undef GGML_METAL_USE_BF16
|
10507
ggml/src/ggml-quants.c
10507
ggml/src/ggml-quants.c
File diff suppressed because it is too large
Load diff
|
@ -11,136 +11,89 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// NOTE: these functions are defined as GGML_API because they are used by the CPU backend
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
// Dot product
|
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
|
|
||||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
||||||
|
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
void iq2xs_init_impl(enum ggml_type type);
|
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
||||||
void iq2xs_free_impl(enum ggml_type type);
|
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
||||||
void iq3xs_init_impl(int grid_size);
|
GGML_API void iq3xs_init_impl(int grid_size);
|
||||||
void iq3xs_free_impl(int grid_size);
|
GGML_API void iq3xs_free_impl(int grid_size);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -671,7 +671,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
// NOTE: buffer types are allocated and never freed; this is by design
|
// NOTE: buffer types are allocated and never freed; this is by design
|
||||||
|
@ -718,7 +718,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
return backend;
|
return backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -730,7 +730,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
||||||
*total = response.total_mem;
|
*total = response.total_mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||||
auto sock = get_socket(endpoint);
|
auto sock = get_socket(endpoint);
|
||||||
if (sock == nullptr) {
|
if (sock == nullptr) {
|
||||||
*free = 0;
|
*free = 0;
|
12
ggml/src/ggml-threading.cpp
Normal file
12
ggml/src/ggml-threading.cpp
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
#include "ggml-threading.h"
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
std::mutex ggml_critical_section_mutex;
|
||||||
|
|
||||||
|
void ggml_critical_section_start() {
|
||||||
|
ggml_critical_section_mutex.lock();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_critical_section_end(void) {
|
||||||
|
ggml_critical_section_mutex.unlock();
|
||||||
|
}
|
12
ggml/src/ggml-threading.h
Normal file
12
ggml/src/ggml-threading.h
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void ggml_critical_section_start(void);
|
||||||
|
void ggml_critical_section_end(void);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue