mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # CMakeLists.txt # Makefile # Package.swift # build.zig # tests/test-backend-ops.cpp
This commit is contained in:
commit
bfbaf0011c
16 changed files with 2005 additions and 141 deletions
|
@ -80,6 +80,7 @@ if (LLAMA_CUBLAS)
|
||||||
|
|
||||||
enable_language(CUDA)
|
enable_language(CUDA)
|
||||||
|
|
||||||
|
add_compile_definitions(GGML_USE_LLAMAFILE)
|
||||||
add_compile_definitions(GGML_USE_CUDA)
|
add_compile_definitions(GGML_USE_CUDA)
|
||||||
add_compile_definitions(SD_USE_CUBLAS)
|
add_compile_definitions(SD_USE_CUBLAS)
|
||||||
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
|
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
|
||||||
|
@ -389,6 +390,8 @@ add_library(ggml OBJECT
|
||||||
ggml-backend.h
|
ggml-backend.h
|
||||||
ggml-quants.c
|
ggml-quants.c
|
||||||
ggml-quants.h
|
ggml-quants.h
|
||||||
|
sgemm.cpp
|
||||||
|
sgemm.h
|
||||||
${GGML_SOURCES_CUDA})
|
${GGML_SOURCES_CUDA})
|
||||||
target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools)
|
target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools)
|
||||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||||
|
|
57
Makefile
57
Makefile
|
@ -39,8 +39,8 @@ endif
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
||||||
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
||||||
FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS))
|
FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS))
|
||||||
|
@ -62,6 +62,10 @@ endif
|
||||||
CUBLASLD_FLAGS =
|
CUBLASLD_FLAGS =
|
||||||
CUBLAS_OBJS =
|
CUBLAS_OBJS =
|
||||||
|
|
||||||
|
OBJS_FULL += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o sgemm.o llava.o llavaclip.o common.o grammar-parser.o
|
||||||
|
OBJS_SIMPLE += ggml-alloc.o ggml-backend.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o llava.o llavaclip.o common.o grammar-parser.o
|
||||||
|
OBJS_FAILSAFE += ggml-alloc.o ggml-backend.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o llava.o llavaclip.o common.o grammar-parser.o
|
||||||
|
|
||||||
#lets try enabling everything
|
#lets try enabling everything
|
||||||
CFLAGS += -pthread -s -Wno-deprecated -Wno-deprecated-declarations
|
CFLAGS += -pthread -s -Wno-deprecated -Wno-deprecated-declarations
|
||||||
CXXFLAGS += -pthread -s -Wno-multichar -Wno-write-strings -Wno-deprecated -Wno-deprecated-declarations
|
CXXFLAGS += -pthread -s -Wno-multichar -Wno-write-strings -Wno-deprecated -Wno-deprecated-declarations
|
||||||
|
@ -406,6 +410,13 @@ ggml-quants_noavx2.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common
|
||||||
ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h
|
ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
#sgemm
|
||||||
|
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
||||||
|
$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
|
sgemm_noavx2.o: sgemm.cpp sgemm.h ggml.h
|
||||||
|
$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
|
sgemm_failsafe.o: sgemm.cpp sgemm.h ggml.h
|
||||||
|
$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#there's no intrinsics or special gpu ops used here, so we can have a universal object
|
#there's no intrinsics or special gpu ops used here, so we can have a universal object
|
||||||
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||||
|
@ -513,25 +524,25 @@ clean:
|
||||||
rm -vrf ggml-cuda/*.o
|
rm -vrf ggml-cuda/*.o
|
||||||
|
|
||||||
# useful tools
|
# useful tools
|
||||||
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o unicode-data.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o llama.o console.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o unicode-data.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o llama.o console.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o unicode-data.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o llama.o console.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o unicode.o unicode-data.o $(OBJS)
|
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-quants.o ggml-alloc.o unicode.o unicode-data.o ggml-backend.o llama.o common/sampling.cpp build-info.h common.o grammar-parser.o $(OBJS)
|
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o common/sampling.cpp build-info.h $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
|
||||||
#generated libraries
|
#generated libraries
|
||||||
koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(DEFAULT_BUILD)
|
$(DEFAULT_BUILD)
|
||||||
|
|
||||||
ifdef OPENBLAS_BUILD
|
ifdef OPENBLAS_BUILD
|
||||||
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(OPENBLAS_BUILD)
|
$(OPENBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_openblas:
|
koboldcpp_openblas:
|
||||||
|
@ -539,7 +550,7 @@ koboldcpp_openblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef FAILSAFE_BUILD
|
ifdef FAILSAFE_BUILD
|
||||||
koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o $(OBJS_FAILSAFE) $(OBJS)
|
||||||
$(FAILSAFE_BUILD)
|
$(FAILSAFE_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_failsafe:
|
koboldcpp_failsafe:
|
||||||
|
@ -547,7 +558,7 @@ koboldcpp_failsafe:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(NOAVX2_BUILD)
|
$(NOAVX2_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_noavx2:
|
koboldcpp_noavx2:
|
||||||
|
@ -555,10 +566,10 @@ koboldcpp_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CLBLAST_BUILD
|
ifdef CLBLAST_BUILD
|
||||||
koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_clblast_noavx2:
|
koboldcpp_clblast_noavx2:
|
||||||
|
@ -572,7 +583,7 @@ koboldcpp_clblast_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CUBLAS_BUILD
|
ifdef CUBLAS_BUILD
|
||||||
koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_cublas.o $(CUBLAS_OBJS) $(OBJS)
|
koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(CUBLAS_BUILD)
|
$(CUBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_cublas:
|
koboldcpp_cublas:
|
||||||
|
@ -580,7 +591,7 @@ koboldcpp_cublas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef HIPBLAS_BUILD
|
ifdef HIPBLAS_BUILD
|
||||||
koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_cublas.o $(HIP_OBJS) $(OBJS)
|
koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(HIPBLAS_BUILD)
|
$(HIPBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_hipblas:
|
koboldcpp_hipblas:
|
||||||
|
@ -588,10 +599,10 @@ koboldcpp_hipblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o unicode-data.o grammar-parser.o sdcpp_default.o $(OBJS)
|
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_vulkan_noavx2:
|
koboldcpp_vulkan_noavx2:
|
||||||
|
@ -605,17 +616,17 @@ koboldcpp_vulkan_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# tools
|
# tools
|
||||||
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o
|
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
|
quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
|
quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
|
quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
|
quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o unicode-data.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
|
quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o llama.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
#window simple clinfo
|
#window simple clinfo
|
||||||
|
|
|
@ -109,6 +109,79 @@ int32_t get_num_physical_cores() {
|
||||||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__x86_64__) && defined(__linux__)
|
||||||
|
#include <pthread.h>
|
||||||
|
|
||||||
|
static void cpuid(unsigned leaf, unsigned subleaf,
|
||||||
|
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
|
||||||
|
__asm__("movq\t%%rbx,%%rsi\n\t"
|
||||||
|
"cpuid\n\t"
|
||||||
|
"xchgq\t%%rbx,%%rsi"
|
||||||
|
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
|
||||||
|
: "0"(leaf), "2"(subleaf));
|
||||||
|
}
|
||||||
|
|
||||||
|
static int pin_cpu(int cpu) {
|
||||||
|
cpu_set_t mask;
|
||||||
|
CPU_ZERO(&mask);
|
||||||
|
CPU_SET(cpu, &mask);
|
||||||
|
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_hybrid_cpu(void) {
|
||||||
|
unsigned eax, ebx, ecx, edx;
|
||||||
|
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
return !!(edx & (1u << 15));
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_running_on_efficiency_core(void) {
|
||||||
|
unsigned eax, ebx, ecx, edx;
|
||||||
|
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
int intel_atom = 0x20;
|
||||||
|
int core_type = (eax & 0xff000000u) >> 24;
|
||||||
|
return core_type == intel_atom;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int count_math_cpus(int cpu_count) {
|
||||||
|
int result = 0;
|
||||||
|
for (int cpu = 0; cpu < cpu_count; ++cpu) {
|
||||||
|
if (pin_cpu(cpu)) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (is_running_on_efficiency_core()) {
|
||||||
|
continue; // efficiency cores harm lockstep threading
|
||||||
|
}
|
||||||
|
++cpu; // hyperthreading isn't useful for linear algebra
|
||||||
|
++result;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __x86_64__ && __linux__
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns number of CPUs on system that are useful for math.
|
||||||
|
*/
|
||||||
|
int get_math_cpu_count() {
|
||||||
|
#if defined(__x86_64__) && defined(__linux__)
|
||||||
|
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
|
if (cpu_count < 1) {
|
||||||
|
return get_num_physical_cores();
|
||||||
|
}
|
||||||
|
if (is_hybrid_cpu()) {
|
||||||
|
cpu_set_t affinity;
|
||||||
|
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
|
||||||
|
int result = count_math_cpus(cpu_count);
|
||||||
|
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
|
||||||
|
if (result > 0) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return get_num_physical_cores();
|
||||||
|
}
|
||||||
|
|
||||||
void process_escapes(std::string & input) {
|
void process_escapes(std::string & input) {
|
||||||
std::size_t input_len = input.length();
|
std::size_t input_len = input.length();
|
||||||
std::size_t output_idx = 0;
|
std::size_t output_idx = 0;
|
||||||
|
|
|
@ -35,6 +35,7 @@
|
||||||
|
|
||||||
struct llama_control_vector_load_info;
|
struct llama_control_vector_load_info;
|
||||||
|
|
||||||
|
int get_math_cpu_count();
|
||||||
int32_t get_num_physical_cores();
|
int32_t get_num_physical_cores();
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -44,7 +45,7 @@ int32_t get_num_physical_cores();
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
|
|
||||||
int32_t n_threads = get_num_physical_cores();
|
int32_t n_threads = get_math_cpu_count();
|
||||||
int32_t n_threads_draft = -1;
|
int32_t n_threads_draft = -1;
|
||||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||||
int32_t n_threads_batch_draft = -1;
|
int32_t n_threads_batch_draft = -1;
|
||||||
|
|
|
@ -1207,9 +1207,91 @@ class StableLMModel(Model):
|
||||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||||
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
n_head = self.hparams.get("num_attention_heads")
|
||||||
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
q_norms = dict()
|
||||||
|
k_norms = dict()
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
if name.find("q_layernorm.norms") != -1:
|
||||||
|
q_norms[name] = data
|
||||||
|
if len(q_norms) >= (block_count * n_head):
|
||||||
|
self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
|
||||||
|
continue
|
||||||
|
if name.find("k_layernorm.norms") != -1:
|
||||||
|
k_norms[name] = data
|
||||||
|
if len(k_norms) >= (block_count * n_kv_head):
|
||||||
|
self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
|
||||||
|
for bid in range(block_count):
|
||||||
|
datas = []
|
||||||
|
for xid in range(n_head):
|
||||||
|
ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
|
||||||
|
datas.append(norms[ename])
|
||||||
|
del norms[ename]
|
||||||
|
data = np.stack(datas, axis=0)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
|
||||||
|
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||||
class LlamaModel(Model):
|
class LlamaModel(Model):
|
||||||
|
@ -1700,6 +1782,105 @@ class Qwen2Model(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("Qwen2MoeForCausalLM")
|
||||||
|
class Qwen2MoeModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.QWEN2MOE
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
if (n_experts := self.hparams.get("num_experts")) is not None:
|
||||||
|
self.gguf_writer.add_expert_count(n_experts)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
n_experts = self.hparams.get("num_experts")
|
||||||
|
experts = dict()
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
# we don't need these
|
||||||
|
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
||||||
|
continue
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
# process the experts separately
|
||||||
|
if name.find("experts") != -1:
|
||||||
|
experts[name] = data
|
||||||
|
if len(experts) >= n_experts * 3:
|
||||||
|
# merge the experts into a single 3d tensor
|
||||||
|
for bid in range(block_count):
|
||||||
|
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||||
|
full = True
|
||||||
|
for xid in range(n_experts):
|
||||||
|
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||||
|
if ename not in experts:
|
||||||
|
full = False
|
||||||
|
break
|
||||||
|
if not full:
|
||||||
|
continue
|
||||||
|
|
||||||
|
datas = []
|
||||||
|
for xid in range(n_experts):
|
||||||
|
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||||
|
datas.append(experts[ename])
|
||||||
|
del experts[ename]
|
||||||
|
|
||||||
|
data = np.stack(datas, axis=0)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||||
|
|
||||||
|
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
if len(experts) > 0:
|
||||||
|
raise ValueError(f"Unprocessed experts: {experts.keys()}")
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GPT2LMHeadModel")
|
@Model.register("GPT2LMHeadModel")
|
||||||
class GPT2Model(Model):
|
class GPT2Model(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GPT2
|
model_arch = gguf.MODEL_ARCH.GPT2
|
||||||
|
@ -2277,6 +2458,12 @@ class GemmaModel(Model):
|
||||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
for name, data_torch in self.get_tensors():
|
for name, data_torch in self.get_tensors():
|
||||||
|
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||||
|
# To prevent errors, skip loading lm_head.weight.
|
||||||
|
if name == "lm_head.weight":
|
||||||
|
print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
continue
|
||||||
|
|
||||||
old_dtype = data_torch.dtype
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
# convert any unsupported data types to float32
|
# convert any unsupported data types to float32
|
||||||
|
|
|
@ -191,7 +191,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* n_ubatch */ {512},
|
/* n_ubatch */ {512},
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_math_cpu_count()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
|
|
@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#else
|
#else
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||||
#if !defined(__riscv)
|
#if !defined(__riscv)
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
57
ggml-metal.m
57
ggml-metal.m
|
@ -41,8 +41,11 @@ enum ggml_metal_kernel_type {
|
||||||
GGML_METAL_KERNEL_TYPE_TANH,
|
GGML_METAL_KERNEL_TYPE_TANH,
|
||||||
GGML_METAL_KERNEL_TYPE_RELU,
|
GGML_METAL_KERNEL_TYPE_RELU,
|
||||||
GGML_METAL_KERNEL_TYPE_GELU,
|
GGML_METAL_KERNEL_TYPE_GELU,
|
||||||
|
GGML_METAL_KERNEL_TYPE_GELU_4,
|
||||||
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
|
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
|
||||||
|
GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
|
||||||
GGML_METAL_KERNEL_TYPE_SILU,
|
GGML_METAL_KERNEL_TYPE_SILU,
|
||||||
|
GGML_METAL_KERNEL_TYPE_SILU_4,
|
||||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX,
|
GGML_METAL_KERNEL_TYPE_SOFT_MAX,
|
||||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
|
GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
|
||||||
GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
|
GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
|
||||||
|
@ -473,8 +476,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||||
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
|
||||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
|
||||||
|
@ -1178,6 +1184,9 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||||
|
// we are not taking into account the strides, so for now require contiguous tensors
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
|
||||||
case GGML_UNARY_OP_TANH:
|
case GGML_UNARY_OP_TANH:
|
||||||
{
|
{
|
||||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
|
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
|
||||||
|
@ -1204,42 +1213,60 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
} break;
|
} break;
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
{
|
{
|
||||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
|
int64_t n = ggml_nelements(dst);
|
||||||
|
|
||||||
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
|
||||||
|
if (n % 4 == 0) {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
|
||||||
|
n /= 4;
|
||||||
|
} else {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
|
||||||
|
}
|
||||||
|
|
||||||
[encoder setComputePipelineState:pipeline];
|
[encoder setComputePipelineState:pipeline];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
|
||||||
const int64_t n = ggml_nelements(dst);
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
GGML_ASSERT(n % 4 == 0);
|
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
|
||||||
} break;
|
} break;
|
||||||
case GGML_UNARY_OP_GELU_QUICK:
|
case GGML_UNARY_OP_GELU_QUICK:
|
||||||
{
|
{
|
||||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
|
int64_t n = ggml_nelements(dst);
|
||||||
|
|
||||||
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
|
||||||
|
if (n % 4 == 0) {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
|
||||||
|
n /= 4;
|
||||||
|
} else {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
|
||||||
|
}
|
||||||
|
|
||||||
[encoder setComputePipelineState:pipeline];
|
[encoder setComputePipelineState:pipeline];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
|
||||||
const int64_t n = ggml_nelements(dst);
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
GGML_ASSERT(n % 4 == 0);
|
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
|
||||||
} break;
|
} break;
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
{
|
{
|
||||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
|
int64_t n = ggml_nelements(dst);
|
||||||
|
|
||||||
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
|
||||||
|
if (n % 4 == 0) {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
|
||||||
|
n /= 4;
|
||||||
|
} else {
|
||||||
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
|
||||||
|
}
|
||||||
|
|
||||||
[encoder setComputePipelineState:pipeline];
|
[encoder setComputePipelineState:pipeline];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
|
|
||||||
const int64_t n = ggml_nelements(dst);
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
GGML_ASSERT(n % 4 == 0);
|
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
|
|
@ -242,6 +242,15 @@ constant float GELU_QUICK_COEF = -1.702f;
|
||||||
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||||
|
|
||||||
kernel void kernel_gelu(
|
kernel void kernel_gelu(
|
||||||
|
device const float * src0,
|
||||||
|
device float * dst,
|
||||||
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
device const float & x = src0[tpig];
|
||||||
|
|
||||||
|
dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_gelu_4(
|
||||||
device const float4 * src0,
|
device const float4 * src0,
|
||||||
device float4 * dst,
|
device float4 * dst,
|
||||||
uint tpig[[thread_position_in_grid]]) {
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
@ -255,6 +264,15 @@ kernel void kernel_gelu(
|
||||||
}
|
}
|
||||||
|
|
||||||
kernel void kernel_gelu_quick(
|
kernel void kernel_gelu_quick(
|
||||||
|
device const float * src0,
|
||||||
|
device float * dst,
|
||||||
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
device const float & x = src0[tpig];
|
||||||
|
|
||||||
|
dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_gelu_quick_4(
|
||||||
device const float4 * src0,
|
device const float4 * src0,
|
||||||
device float4 * dst,
|
device float4 * dst,
|
||||||
uint tpig[[thread_position_in_grid]]) {
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
@ -264,6 +282,14 @@ kernel void kernel_gelu_quick(
|
||||||
}
|
}
|
||||||
|
|
||||||
kernel void kernel_silu(
|
kernel void kernel_silu(
|
||||||
|
device const float * src0,
|
||||||
|
device float * dst,
|
||||||
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
device const float & x = src0[tpig];
|
||||||
|
dst[tpig] = x / (1.0f + exp(-x));
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_silu_4(
|
||||||
device const float4 * src0,
|
device const float4 * src0,
|
||||||
device float4 * dst,
|
device float4 * dst,
|
||||||
uint tpig[[thread_position_in_grid]]) {
|
uint tpig[[thread_position_in_grid]]) {
|
||||||
|
|
|
@ -133,7 +133,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
||||||
#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
|
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||||
const __m256i zero = _mm256_setzero_si256();
|
const __m256i zero = _mm256_setzero_si256();
|
||||||
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
||||||
return _mm256_cvtepi32_ps(summed_pairs);
|
return _mm256_cvtepi32_ps(summed_pairs);
|
||||||
|
|
51
ggml.c
51
ggml.c
|
@ -4,6 +4,7 @@
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "sgemm.h"
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
|
@ -32,6 +33,10 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||||
|
#undef GGML_USE_LLAMAFILE
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
// disable "possible loss of data" to avoid hundreds of casts
|
// disable "possible loss of data" to avoid hundreds of casts
|
||||||
// we should just be careful :)
|
// we should just be careful :)
|
||||||
|
@ -10811,6 +10816,28 @@ static void ggml_compute_forward_mul_mat(
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if GGML_USE_LLAMAFILE
|
||||||
|
if (nb10 == ggml_type_size(src1->type)) {
|
||||||
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||||
|
nb01/ggml_type_size(src0->type),
|
||||||
|
(const char *)src1->data + i12*nb12 + i13*nb13,
|
||||||
|
nb11/ggml_type_size(src1->type),
|
||||||
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
|
nb1/ggml_type_size(dst->type),
|
||||||
|
ith, nth,
|
||||||
|
params->type,
|
||||||
|
src0->type,
|
||||||
|
src1->type,
|
||||||
|
dst->type))
|
||||||
|
goto UseGgmlGemm1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
UseGgmlGemm1:;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||||
if (ith != 0) {
|
if (ith != 0) {
|
||||||
return;
|
return;
|
||||||
|
@ -10842,6 +10869,30 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||||
|
|
||||||
|
#if GGML_USE_LLAMAFILE
|
||||||
|
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
|
||||||
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||||
|
nb01/ggml_type_size(src0->type),
|
||||||
|
(const char *)wdata + ggml_row_size(vec_dot_type,
|
||||||
|
nb12/ggml_type_size(src1->type)*i12 +
|
||||||
|
nb13/ggml_type_size(src1->type)*i13),
|
||||||
|
row_size/ggml_type_size(vec_dot_type),
|
||||||
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
|
nb1/ggml_type_size(dst->type),
|
||||||
|
ith, nth,
|
||||||
|
params->type,
|
||||||
|
src0->type,
|
||||||
|
vec_dot_type,
|
||||||
|
dst->type))
|
||||||
|
goto UseGgmlGemm2;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
UseGgmlGemm2:;
|
||||||
|
#endif
|
||||||
|
|
||||||
const int64_t nr0 = ne01; // src0 rows
|
const int64_t nr0 = ne01; // src0 rows
|
||||||
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
|
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
|
||||||
|
|
||||||
|
|
|
@ -120,6 +120,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
QWEN = auto()
|
QWEN = auto()
|
||||||
QWEN2 = auto()
|
QWEN2 = auto()
|
||||||
|
QWEN2MOE = auto()
|
||||||
PHI2 = auto()
|
PHI2 = auto()
|
||||||
PLAMO = auto()
|
PLAMO = auto()
|
||||||
CODESHELL = auto()
|
CODESHELL = auto()
|
||||||
|
@ -152,6 +153,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
ATTN_OUT_NORM = auto()
|
ATTN_OUT_NORM = auto()
|
||||||
ATTN_ROT_EMBD = auto()
|
ATTN_ROT_EMBD = auto()
|
||||||
FFN_GATE_INP = auto()
|
FFN_GATE_INP = auto()
|
||||||
|
FFN_GATE_INP_SHEXP = auto()
|
||||||
FFN_NORM = auto()
|
FFN_NORM = auto()
|
||||||
FFN_GATE = auto()
|
FFN_GATE = auto()
|
||||||
FFN_DOWN = auto()
|
FFN_DOWN = auto()
|
||||||
|
@ -160,6 +162,9 @@ class MODEL_TENSOR(IntEnum):
|
||||||
FFN_GATE_EXP = auto()
|
FFN_GATE_EXP = auto()
|
||||||
FFN_DOWN_EXP = auto()
|
FFN_DOWN_EXP = auto()
|
||||||
FFN_UP_EXP = auto()
|
FFN_UP_EXP = auto()
|
||||||
|
FFN_GATE_SHEXP = auto()
|
||||||
|
FFN_DOWN_SHEXP = auto()
|
||||||
|
FFN_UP_SHEXP = auto()
|
||||||
ATTN_Q_NORM = auto()
|
ATTN_Q_NORM = auto()
|
||||||
ATTN_K_NORM = auto()
|
ATTN_K_NORM = auto()
|
||||||
LAYER_OUT_NORM = auto()
|
LAYER_OUT_NORM = auto()
|
||||||
|
@ -190,6 +195,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
MODEL_ARCH.QWEN: "qwen",
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
MODEL_ARCH.QWEN2: "qwen2",
|
MODEL_ARCH.QWEN2: "qwen2",
|
||||||
|
MODEL_ARCH.QWEN2MOE: "qwen2moe",
|
||||||
MODEL_ARCH.PHI2: "phi2",
|
MODEL_ARCH.PHI2: "phi2",
|
||||||
MODEL_ARCH.PLAMO: "plamo",
|
MODEL_ARCH.PLAMO: "plamo",
|
||||||
MODEL_ARCH.CODESHELL: "codeshell",
|
MODEL_ARCH.CODESHELL: "codeshell",
|
||||||
|
@ -224,10 +230,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||||
|
@ -445,6 +455,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_GATE,
|
MODEL_TENSOR.FFN_GATE,
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM,
|
||||||
],
|
],
|
||||||
MODEL_ARCH.QWEN: [
|
MODEL_ARCH.QWEN: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
@ -474,6 +486,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.QWEN2MOE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||||
|
],
|
||||||
MODEL_ARCH.PLAMO: [
|
MODEL_ARCH.PLAMO: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
|
|
@ -208,10 +208,15 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.FFN_GATE_INP: (
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
"layers.{bid}.feed_forward.gate", # mixtral
|
"layers.{bid}.feed_forward.gate", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||||
|
"model.layers.{bid}.mlp.gate", # qwen2moe
|
||||||
"transformer.decoder_layer.{bid}.router", # Grok
|
"transformer.decoder_layer.{bid}.router", # Grok
|
||||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
|
||||||
|
),
|
||||||
|
|
||||||
# Feed-forward up
|
# Feed-forward up
|
||||||
MODEL_TENSOR.FFN_UP: (
|
MODEL_TENSOR.FFN_UP: (
|
||||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||||
|
@ -239,6 +244,11 @@ class TensorNameMap:
|
||||||
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
||||||
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
||||||
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||||
),
|
),
|
||||||
|
|
||||||
# AWQ-activation gate
|
# AWQ-activation gate
|
||||||
|
@ -260,6 +270,11 @@ class TensorNameMap:
|
||||||
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
|
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
|
||||||
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
||||||
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward down
|
# Feed-forward down
|
||||||
|
@ -288,6 +303,11 @@ class TensorNameMap:
|
||||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||||
|
@ -366,7 +386,7 @@ class TensorNameMap:
|
||||||
if tensor not in MODEL_TENSORS[arch]:
|
if tensor not in MODEL_TENSORS[arch]:
|
||||||
continue
|
continue
|
||||||
# TODO: make this configurable
|
# TODO: make this configurable
|
||||||
n_experts = 8
|
n_experts = 60
|
||||||
for xid in range(n_experts):
|
for xid in range(n_experts):
|
||||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
|
|
300
llama.cpp
300
llama.cpp
|
@ -108,7 +108,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LLAMA_MAX_NODES 8192
|
#define LLAMA_MAX_NODES 8192
|
||||||
#define LLAMA_MAX_EXPERTS 16
|
#define LLAMA_MAX_EXPERTS 60
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -231,6 +231,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
LLM_ARCH_QWEN,
|
LLM_ARCH_QWEN,
|
||||||
LLM_ARCH_QWEN2,
|
LLM_ARCH_QWEN2,
|
||||||
|
LLM_ARCH_QWEN2MOE,
|
||||||
LLM_ARCH_PHI2,
|
LLM_ARCH_PHI2,
|
||||||
LLM_ARCH_PLAMO,
|
LLM_ARCH_PLAMO,
|
||||||
LLM_ARCH_CODESHELL,
|
LLM_ARCH_CODESHELL,
|
||||||
|
@ -264,6 +265,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
{ LLM_ARCH_QWEN, "qwen" },
|
{ LLM_ARCH_QWEN, "qwen" },
|
||||||
{ LLM_ARCH_QWEN2, "qwen2" },
|
{ LLM_ARCH_QWEN2, "qwen2" },
|
||||||
|
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
||||||
{ LLM_ARCH_PHI2, "phi2" },
|
{ LLM_ARCH_PHI2, "phi2" },
|
||||||
{ LLM_ARCH_PLAMO, "plamo" },
|
{ LLM_ARCH_PLAMO, "plamo" },
|
||||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||||
|
@ -459,6 +461,7 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_ATTN_OUT_NORM,
|
LLM_TENSOR_ATTN_OUT_NORM,
|
||||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||||
LLM_TENSOR_FFN_GATE_INP,
|
LLM_TENSOR_FFN_GATE_INP,
|
||||||
|
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_NORM,
|
||||||
LLM_TENSOR_FFN_GATE,
|
LLM_TENSOR_FFN_GATE,
|
||||||
LLM_TENSOR_FFN_DOWN,
|
LLM_TENSOR_FFN_DOWN,
|
||||||
|
@ -470,6 +473,9 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
||||||
LLM_TENSOR_FFN_GATE_EXPS,
|
LLM_TENSOR_FFN_GATE_EXPS,
|
||||||
LLM_TENSOR_FFN_UP_EXPS,
|
LLM_TENSOR_FFN_UP_EXPS,
|
||||||
|
LLM_TENSOR_FFN_DOWN_SHEXP,
|
||||||
|
LLM_TENSOR_FFN_GATE_SHEXP,
|
||||||
|
LLM_TENSOR_FFN_UP_SHEXP,
|
||||||
LLM_TENSOR_ATTN_Q_NORM,
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
LLM_TENSOR_ATTN_K_NORM,
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
LLM_TENSOR_LAYER_OUT_NORM,
|
LLM_TENSOR_LAYER_OUT_NORM,
|
||||||
|
@ -732,6 +738,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -767,6 +775,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_QWEN2MOE,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_PHI2,
|
LLM_ARCH_PHI2,
|
||||||
{
|
{
|
||||||
|
@ -1742,6 +1772,7 @@ enum e_model {
|
||||||
MODEL_4B,
|
MODEL_4B,
|
||||||
MODEL_7B,
|
MODEL_7B,
|
||||||
MODEL_8B,
|
MODEL_8B,
|
||||||
|
MODEL_12B,
|
||||||
MODEL_13B,
|
MODEL_13B,
|
||||||
MODEL_14B,
|
MODEL_14B,
|
||||||
MODEL_15B,
|
MODEL_15B,
|
||||||
|
@ -1757,6 +1788,7 @@ enum e_model {
|
||||||
MODEL_MEDIUM,
|
MODEL_MEDIUM,
|
||||||
MODEL_LARGE,
|
MODEL_LARGE,
|
||||||
MODEL_XL,
|
MODEL_XL,
|
||||||
|
MODEL_A2_7B,
|
||||||
MODEL_8x7B,
|
MODEL_8x7B,
|
||||||
MODEL_8x22B,
|
MODEL_8x22B,
|
||||||
MODEL_16x12B,
|
MODEL_16x12B,
|
||||||
|
@ -1943,6 +1975,12 @@ struct llama_layer {
|
||||||
struct ggml_tensor * ffn_down_exps;
|
struct ggml_tensor * ffn_down_exps;
|
||||||
struct ggml_tensor * ffn_up_exps ;
|
struct ggml_tensor * ffn_up_exps ;
|
||||||
|
|
||||||
|
// ff shared expert (shexp)
|
||||||
|
struct ggml_tensor * ffn_gate_inp_shexp;
|
||||||
|
struct ggml_tensor * ffn_gate_shexp;
|
||||||
|
struct ggml_tensor * ffn_down_shexp;
|
||||||
|
struct ggml_tensor * ffn_up_shexp;
|
||||||
|
|
||||||
// ff bias
|
// ff bias
|
||||||
struct ggml_tensor * ffn_down_b; // b2
|
struct ggml_tensor * ffn_down_b; // b2
|
||||||
struct ggml_tensor * ffn_up_b; // b3
|
struct ggml_tensor * ffn_up_b; // b3
|
||||||
|
@ -3616,6 +3654,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||||
case MODEL_3B: return "3B";
|
case MODEL_3B: return "3B";
|
||||||
case MODEL_7B: return "7B";
|
case MODEL_7B: return "7B";
|
||||||
case MODEL_8B: return "8B";
|
case MODEL_8B: return "8B";
|
||||||
|
case MODEL_12B: return "12B";
|
||||||
case MODEL_13B: return "13B";
|
case MODEL_13B: return "13B";
|
||||||
case MODEL_14B: return "14B";
|
case MODEL_14B: return "14B";
|
||||||
case MODEL_15B: return "15B";
|
case MODEL_15B: return "15B";
|
||||||
|
@ -3631,6 +3670,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||||
case MODEL_MEDIUM: return "0.4B";
|
case MODEL_MEDIUM: return "0.4B";
|
||||||
case MODEL_LARGE: return "0.8B";
|
case MODEL_LARGE: return "0.8B";
|
||||||
case MODEL_XL: return "1.5B";
|
case MODEL_XL: return "1.5B";
|
||||||
|
case MODEL_A2_7B: return "A2.7B";
|
||||||
case MODEL_8x7B: return "8x7B";
|
case MODEL_8x7B: return "8x7B";
|
||||||
case MODEL_8x22B: return "8x22B";
|
case MODEL_8x22B: return "8x22B";
|
||||||
case MODEL_16x12B: return "16x12B";
|
case MODEL_16x12B: return "16x12B";
|
||||||
|
@ -3906,6 +3946,7 @@ static void llm_load_hparams(
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 24: model.type = e_model::MODEL_1B; break;
|
case 24: model.type = e_model::MODEL_1B; break;
|
||||||
case 32: model.type = e_model::MODEL_3B; break;
|
case 32: model.type = e_model::MODEL_3B; break;
|
||||||
|
case 40: model.type = e_model::MODEL_12B; break;
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -3930,6 +3971,14 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_QWEN2MOE:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 24: model.type = e_model::MODEL_A2_7B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
@ -4132,9 +4181,11 @@ static void llm_load_vocab(
|
||||||
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
||||||
// new versions of these models have been published.
|
// new versions of these models have been published.
|
||||||
std::string gen_name;
|
std::string gen_name;
|
||||||
ml.get_key(LLM_KV_GENERAL_NAME, gen_name);
|
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
||||||
|
|
||||||
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
||||||
[](unsigned char c){ return std::tolower(c); });
|
[](unsigned char c){ return std::tolower(c); });
|
||||||
|
|
||||||
if (gen_name.find("code") != std::string::npos) {
|
if (gen_name.find("code") != std::string::npos) {
|
||||||
if (model.arch == LLM_ARCH_LLAMA) {
|
if (model.arch == LLM_ARCH_LLAMA) {
|
||||||
vocab.special_prefix_id = 32007;
|
vocab.special_prefix_id = 32007;
|
||||||
|
@ -5154,8 +5205,13 @@ static bool llm_load_tensors(
|
||||||
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
||||||
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
// optional q and k layernorms, present in StableLM 2 12B
|
||||||
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
|
||||||
|
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
|
||||||
|
|
||||||
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
|
||||||
|
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
||||||
|
|
||||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||||
|
@ -5226,6 +5282,54 @@ static bool llm_load_tensors(
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_QWEN2MOE:
|
||||||
|
{
|
||||||
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
|
|
||||||
|
// output
|
||||||
|
{
|
||||||
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||||
|
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
|
||||||
|
// optional bias tensors
|
||||||
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
||||||
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
||||||
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
||||||
|
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
||||||
|
|
||||||
|
GGML_ASSERT(hparams.n_expert > 0);
|
||||||
|
GGML_ASSERT(hparams.n_expert_used > 0);
|
||||||
|
|
||||||
|
// MoE branch
|
||||||
|
auto n_ff_exp = n_ff / hparams.n_expert_used;
|
||||||
|
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||||
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
||||||
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||||
|
|
||||||
|
// Shared expert branch
|
||||||
|
layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
|
||||||
|
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
|
||||||
|
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
|
||||||
|
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
{
|
{
|
||||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
|
@ -6602,7 +6706,7 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
|
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
@ -6635,7 +6739,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
// REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
|
// REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
|
||||||
ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, int il) {
|
ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, bool norm_w, int il) {
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||||
cb(logits, "ffn_moe_logits", il);
|
cb(logits, "ffn_moe_logits", il);
|
||||||
|
|
||||||
|
@ -6652,11 +6756,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
||||||
|
|
||||||
|
if (norm_w) {
|
||||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
||||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||||
|
|
||||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
||||||
cb(weights, "ffn_moe_weights_norm", il);
|
cb(weights, "ffn_moe_weights_norm", il);
|
||||||
|
}
|
||||||
|
|
||||||
// compute expert outputs
|
// compute expert outputs
|
||||||
ggml_tensor * moe_out = nullptr;
|
ggml_tensor * moe_out = nullptr;
|
||||||
|
@ -7153,7 +7259,7 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, il);
|
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, true, il);
|
||||||
|
|
||||||
// Grok
|
// Grok
|
||||||
// if layer_out_norm is present then apply it before adding the input
|
// if layer_out_norm is present then apply it before adding the input
|
||||||
|
@ -7289,7 +7395,7 @@ struct llm_build_context {
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(cur, "attn_out_norm", il);
|
cb(cur, "attn_out_norm", il);
|
||||||
|
|
||||||
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
|
cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
@ -8173,7 +8279,7 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
@ -8182,6 +8288,8 @@ struct llm_build_context {
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(cur, "attn_norm", il);
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * inpSA = cur;
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
|
@ -8206,15 +8314,36 @@ struct llm_build_context {
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
if (model.layers[il].attn_q_norm) {
|
||||||
|
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
||||||
|
model.layers[il].attn_q_norm,
|
||||||
|
NULL,
|
||||||
|
LLM_NORM, cb, il);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
if (model.layers[il].attn_k_norm) {
|
||||||
|
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
||||||
|
model.layers[il].attn_k_norm,
|
||||||
|
NULL,
|
||||||
|
LLM_NORM, cb, il);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, Qcur, inp_pos,
|
||||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, Kcur, inp_pos,
|
||||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
@ -8229,20 +8358,25 @@ struct llm_build_context {
|
||||||
// skip computing output for unused tokens
|
// skip computing output for unused tokens
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||||
cb(ffn_inp, "ffn_inp", il);
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
// feed-forward network
|
// feed-forward network
|
||||||
{
|
{
|
||||||
|
if (model.layers[il].ffn_norm) {
|
||||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
model.layers[il].ffn_norm,
|
model.layers[il].ffn_norm,
|
||||||
model.layers[il].ffn_norm_b,
|
model.layers[il].ffn_norm_b,
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
} else {
|
||||||
|
// parallel residual
|
||||||
|
cur = inpSA;
|
||||||
|
}
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
|
@ -8504,6 +8638,141 @@ struct llm_build_context {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_qwen2moe() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||||
|
int32_t n_tokens = this->n_tokens;
|
||||||
|
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
// norm
|
||||||
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
model.layers[il].attn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
|
// self_attention
|
||||||
|
{
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
Qcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (il == n_layer - 1) {
|
||||||
|
// skip computing output for unused tokens
|
||||||
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
|
n_tokens = n_outputs;
|
||||||
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||||
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
|
// MoE branch
|
||||||
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
|
model.layers[il].ffn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
|
ggml_tensor * moe_out = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, false, il);
|
||||||
|
|
||||||
|
// FFN shared expert
|
||||||
|
{
|
||||||
|
ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
||||||
|
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
||||||
|
|
||||||
|
// sigmoid
|
||||||
|
ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
||||||
|
cb(cur_gate, "ffn_shexp_gate", il);
|
||||||
|
|
||||||
|
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
||||||
|
model.layers[il].ffn_up_shexp, NULL,
|
||||||
|
model.layers[il].ffn_gate_shexp, NULL,
|
||||||
|
model.layers[il].ffn_down_shexp, NULL,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
|
cb(cur_ffn, "ffn_shexp", il);
|
||||||
|
|
||||||
|
ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
|
||||||
|
cb(ffn_shexp_out, "ffn_shexp_out", il);
|
||||||
|
|
||||||
|
moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
|
||||||
|
cb(moe_out, "ffn_out", il);
|
||||||
|
|
||||||
|
cur = moe_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
|
model.output_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, -1);
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_phi2() {
|
struct ggml_cgraph * build_phi2() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
@ -9987,6 +10256,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_qwen2();
|
result = llm.build_qwen2();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_QWEN2MOE:
|
||||||
|
{
|
||||||
|
result = llm.build_qwen2moe();
|
||||||
|
} break;
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
{
|
{
|
||||||
result = llm.build_phi2();
|
result = llm.build_phi2();
|
||||||
|
@ -15139,6 +15412,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||||
case LLM_ARCH_STABLELM:
|
case LLM_ARCH_STABLELM:
|
||||||
case LLM_ARCH_QWEN:
|
case LLM_ARCH_QWEN:
|
||||||
case LLM_ARCH_QWEN2:
|
case LLM_ARCH_QWEN2:
|
||||||
|
case LLM_ARCH_QWEN2MOE:
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
case LLM_ARCH_GEMMA:
|
case LLM_ARCH_GEMMA:
|
||||||
case LLM_ARCH_STARCODER2:
|
case LLM_ARCH_STARCODER2:
|
||||||
|
|
12
sgemm.h
Normal file
12
sgemm.h
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
#pragma once
|
||||||
|
#include <stdbool.h>
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bool llamafile_sgemm(int, int, int, const void *, int, const void *, int,
|
||||||
|
void *, int, int, int, int, int, int, int);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
Loading…
Add table
Add a link
Reference in a new issue