mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # CMakePresets.json # Makefile # Package.swift # ci/run.sh # common/CMakeLists.txt # examples/CMakeLists.txt # flake.lock # ggml/src/CMakeLists.txt # ggml/src/ggml-backend.cpp # ggml/src/ggml.c # pocs/vdot/q8dot.cpp # pocs/vdot/vdot.cpp # tests/test-backend-ops.cpp # tests/test-grad0.cpp # tests/test-quantize-fns.cpp # tests/test-quantize-perf.cpp # tests/test-rope.cpp
This commit is contained in:
commit
bb13925f39
34 changed files with 14917 additions and 19458 deletions
|
@ -421,6 +421,8 @@ endif()
|
||||||
add_library(ggml
|
add_library(ggml
|
||||||
ggml/src/ggml.c
|
ggml/src/ggml.c
|
||||||
ggml/include/ggml.h
|
ggml/include/ggml.h
|
||||||
|
ggml/src/ggml-cpu.c
|
||||||
|
ggml/include/ggml-cpu.h
|
||||||
ggml/src/ggml-alloc.c
|
ggml/src/ggml-alloc.c
|
||||||
ggml/include/ggml-alloc.h
|
ggml/include/ggml-alloc.h
|
||||||
ggml/src/ggml-backend.cpp
|
ggml/src/ggml-backend.cpp
|
||||||
|
|
53
Makefile
53
Makefile
|
@ -430,6 +430,17 @@ ggml_v4_vulkan.o: ggml/src/ggml.c ggml/include/ggml.h
|
||||||
ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
|
ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
|
||||||
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
ggml-cpu.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu_v4_failsafe.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu_v4_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
|
ggml-cpu_v4_clblast.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
ggml-cpu_v4_clblast_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
|
||||||
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#quants
|
#quants
|
||||||
ggml-quants.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
|
ggml-quants.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
|
@ -574,18 +585,18 @@ clean:
|
||||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||||
|
|
||||||
# useful tools
|
# useful tools
|
||||||
main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
gguf: examples/gguf/gguf.cpp build-info.h ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
|
@ -595,11 +606,11 @@ vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
|
||||||
|
|
||||||
#generated libraries
|
#generated libraries
|
||||||
koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(DEFAULT_BUILD)
|
$(DEFAULT_BUILD)
|
||||||
|
|
||||||
ifdef FAILSAFE_BUILD
|
ifdef FAILSAFE_BUILD
|
||||||
koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
|
koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
|
||||||
$(FAILSAFE_BUILD)
|
$(FAILSAFE_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_failsafe:
|
koboldcpp_failsafe:
|
||||||
|
@ -607,7 +618,7 @@ koboldcpp_failsafe:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(NOAVX2_BUILD)
|
$(NOAVX2_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_noavx2:
|
koboldcpp_noavx2:
|
||||||
|
@ -615,10 +626,10 @@ koboldcpp_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CLBLAST_BUILD
|
ifdef CLBLAST_BUILD
|
||||||
koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(CLBLAST_BUILD)
|
$(CLBLAST_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_clblast_noavx2:
|
koboldcpp_clblast_noavx2:
|
||||||
|
@ -632,7 +643,7 @@ koboldcpp_clblast_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef CUBLAS_BUILD
|
ifdef CUBLAS_BUILD
|
||||||
koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_cublas: ggml_v4_cublas.o ggml.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(CUBLAS_BUILD)
|
$(CUBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_cublas:
|
koboldcpp_cublas:
|
||||||
|
@ -640,7 +651,7 @@ koboldcpp_cublas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef HIPBLAS_BUILD
|
ifdef HIPBLAS_BUILD
|
||||||
koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
koboldcpp_hipblas: ggml_v4_cublas.o ggml.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
|
||||||
$(HIPBLAS_BUILD)
|
$(HIPBLAS_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_hipblas:
|
koboldcpp_hipblas:
|
||||||
|
@ -648,10 +659,10 @@ koboldcpp_hipblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_vulkan_noavx2:
|
koboldcpp_vulkan_noavx2:
|
||||||
|
@ -665,17 +676,17 @@ koboldcpp_vulkan_noavx2:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# tools
|
# tools
|
||||||
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gguf: examples/quantize/quantize.cpp ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o llama.o ggml-backend_default.o $(OBJS_FULL)
|
quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o ggml-backend_default.o $(OBJS_FULL)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
#window simple clinfo
|
#window simple clinfo
|
||||||
|
|
16
cmake/arm64-apple-clang.cmake
Normal file
16
cmake/arm64-apple-clang.cmake
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
set( CMAKE_SYSTEM_NAME Darwin )
|
||||||
|
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||||
|
|
||||||
|
set( target arm64-apple-darwin-macho )
|
||||||
|
|
||||||
|
set( CMAKE_C_COMPILER clang )
|
||||||
|
set( CMAKE_CXX_COMPILER clang++ )
|
||||||
|
|
||||||
|
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||||
|
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||||
|
|
||||||
|
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||||
|
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
|
||||||
|
|
||||||
|
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||||
|
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
|
@ -1953,6 +1953,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
||||||
|
|
||||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||||
|
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||||
|
|
||||||
const auto & sparams = params.sparams;
|
const auto & sparams = params.sparams;
|
||||||
|
|
||||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||||
|
|
|
@ -151,7 +151,7 @@ struct common_sampler_params {
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 0; // context size
|
int32_t n_ctx = 4096; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
|
1515
common/train.cpp
1515
common/train.cpp
File diff suppressed because it is too large
Load diff
233
common/train.h
233
common/train.h
|
@ -1,233 +0,0 @@
|
||||||
// Various helper functions and utilities for training
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <random>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
|
||||||
|
|
||||||
typedef std::string mt19937_state;
|
|
||||||
|
|
||||||
struct train_state {
|
|
||||||
struct ggml_opt_context * opt;
|
|
||||||
|
|
||||||
uint64_t train_its;
|
|
||||||
uint64_t train_samples;
|
|
||||||
uint64_t train_tokens;
|
|
||||||
uint64_t train_epochs;
|
|
||||||
|
|
||||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
|
||||||
mt19937_state shuffle_rng_state_current;
|
|
||||||
mt19937_state shuffle_rng_state_next;
|
|
||||||
size_t shuffle_sample_count;
|
|
||||||
size_t shuffle_next_sample;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct train_params_common {
|
|
||||||
const char * fn_train_data;
|
|
||||||
const char * fn_checkpoint_in;
|
|
||||||
const char * fn_checkpoint_out;
|
|
||||||
const char * pattern_fn_it;
|
|
||||||
const char * fn_latest;
|
|
||||||
|
|
||||||
bool print_usage;
|
|
||||||
|
|
||||||
int save_every;
|
|
||||||
|
|
||||||
uint32_t seed;
|
|
||||||
|
|
||||||
int n_ctx;
|
|
||||||
int n_threads;
|
|
||||||
int n_batch;
|
|
||||||
int n_gradient_accumulation;
|
|
||||||
int n_epochs;
|
|
||||||
int n_gpu_layers;
|
|
||||||
|
|
||||||
bool custom_n_ctx;
|
|
||||||
|
|
||||||
bool use_flash;
|
|
||||||
bool use_checkpointing;
|
|
||||||
|
|
||||||
std::string sample_start;
|
|
||||||
bool include_sample_start;
|
|
||||||
bool escape;
|
|
||||||
bool overlapping_samples;
|
|
||||||
bool fill_with_next_samples;
|
|
||||||
bool separate_with_eos;
|
|
||||||
bool separate_with_bos;
|
|
||||||
bool sample_random_offsets;
|
|
||||||
|
|
||||||
bool force_reshuffle;
|
|
||||||
|
|
||||||
int warmup;
|
|
||||||
int cos_decay_steps;
|
|
||||||
float cos_decay_restart;
|
|
||||||
float cos_decay_min;
|
|
||||||
bool enable_restart;
|
|
||||||
|
|
||||||
int opt_past;
|
|
||||||
float opt_delta;
|
|
||||||
int opt_max_no_improvement;
|
|
||||||
|
|
||||||
int adam_n_iter;
|
|
||||||
float adam_alpha;
|
|
||||||
float adam_min_alpha;
|
|
||||||
float adam_decay;
|
|
||||||
int adam_decay_min_ndim;
|
|
||||||
float adam_beta1;
|
|
||||||
float adam_beta2;
|
|
||||||
float adam_gclip;
|
|
||||||
float adam_eps_f;
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
|
||||||
|
|
||||||
struct train_opt_callback_data {
|
|
||||||
struct train_params_common * params;
|
|
||||||
struct train_state * train;
|
|
||||||
save_train_files_callback save_cb;
|
|
||||||
void * save_data;
|
|
||||||
struct llama_context * lctx;
|
|
||||||
int last_save_iter;
|
|
||||||
llama_token * tokens_data;
|
|
||||||
size_t tokens_size;
|
|
||||||
size_t * samples_begin;
|
|
||||||
size_t * samples_size;
|
|
||||||
size_t * shuffled_samples_offs;
|
|
||||||
size_t * shuffled_samples_begin;
|
|
||||||
size_t * shuffled_samples_size;
|
|
||||||
size_t samples_count;
|
|
||||||
struct ggml_tensor * tokens_input;
|
|
||||||
struct ggml_tensor * target_probs;
|
|
||||||
int first_iter;
|
|
||||||
int first_epoch;
|
|
||||||
int iter_at_last_epoch;
|
|
||||||
int64_t last_time;
|
|
||||||
double millis_per_iter;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct train_state * init_train_state();
|
|
||||||
void free_train_state(struct train_state * state);
|
|
||||||
|
|
||||||
struct train_params_common get_default_train_params_common();
|
|
||||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
|
||||||
|
|
||||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
|
||||||
void finish_processing_train_args(struct train_params_common * params);
|
|
||||||
|
|
||||||
struct random_normal_distribution;
|
|
||||||
struct random_uniform_distribution;
|
|
||||||
|
|
||||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
|
||||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
|
||||||
|
|
||||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
|
||||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
|
||||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
// generate random float in interval [0,1)
|
|
||||||
float frand();
|
|
||||||
float frand_normal (struct random_normal_distribution * rnd);
|
|
||||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
int clamp (const int v, const int min, const int max);
|
|
||||||
float fclamp(const float v, const float min, const float max);
|
|
||||||
|
|
||||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
|
||||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
|
||||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
|
||||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
|
||||||
|
|
||||||
size_t tokenize_file(
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const char * filename,
|
|
||||||
const std::string & sample_start,
|
|
||||||
bool include_sample_start,
|
|
||||||
bool overlapping_samples,
|
|
||||||
unsigned context_length,
|
|
||||||
std::vector<llama_token> & out_tokens,
|
|
||||||
std::vector<size_t> & out_samples_begin,
|
|
||||||
std::vector<size_t> & out_samples_size);
|
|
||||||
|
|
||||||
int64_t get_example_targets_batch(
|
|
||||||
struct llama_context * lctx,
|
|
||||||
struct ggml_tensor * tokens_input,
|
|
||||||
struct ggml_tensor * target_probs,
|
|
||||||
int64_t example_id,
|
|
||||||
const size_t * samples_offs,
|
|
||||||
const size_t * samples_begin,
|
|
||||||
const size_t * samples_size,
|
|
||||||
size_t samples_count,
|
|
||||||
const llama_token * train_data,
|
|
||||||
size_t n_train_data,
|
|
||||||
bool separate_with_eos,
|
|
||||||
bool separate_with_bos,
|
|
||||||
bool fill_with_next_samples,
|
|
||||||
bool sample_random_offsets);
|
|
||||||
|
|
||||||
|
|
||||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
|
||||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
|
||||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
|
||||||
|
|
||||||
mt19937_state shuffle_samples(
|
|
||||||
const mt19937_state & rng_state,
|
|
||||||
size_t * shuffled_offs,
|
|
||||||
size_t * shuffled_begins,
|
|
||||||
size_t * shuffled_sizes,
|
|
||||||
const size_t * begins,
|
|
||||||
const size_t * sizes,
|
|
||||||
size_t count);
|
|
||||||
|
|
||||||
size_t hash_combine(size_t h1, size_t h2);
|
|
||||||
|
|
||||||
size_t compute_samples_hash(
|
|
||||||
const char* fn,
|
|
||||||
const size_t* samples_begin,
|
|
||||||
const size_t* samples_size,
|
|
||||||
size_t sample_count);
|
|
||||||
|
|
||||||
|
|
||||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
|
||||||
|
|
||||||
void print_duration(double milliseconds);
|
|
||||||
|
|
||||||
float cosine_decay(
|
|
||||||
int64_t step,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float minimum);
|
|
||||||
|
|
||||||
float cosine_decay_restart(
|
|
||||||
int64_t step,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float minimum,
|
|
||||||
float restart_step_mult);
|
|
||||||
|
|
||||||
float learning_schedule(
|
|
||||||
int64_t step,
|
|
||||||
int64_t warmup_steps,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float learning_rate,
|
|
||||||
float overall_minimum,
|
|
||||||
float cos_decay_minimum,
|
|
||||||
float cos_decay_restart_step_mult,
|
|
||||||
bool enable_restart);
|
|
||||||
|
|
||||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
|
||||||
|
|
||||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
|
||||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
|
||||||
|
|
||||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
|
||||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
|
||||||
|
|
||||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
|
||||||
|
|
||||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
|
|
|
@ -72,7 +72,8 @@ class Model:
|
||||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||||
use_temp_file: bool = False, eager: bool = False,
|
use_temp_file: bool = False, eager: bool = False,
|
||||||
metadata_override: Path | None = None, model_name: str | None = None,
|
metadata_override: Path | None = None, model_name: str | None = None,
|
||||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||||
|
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
|
||||||
if type(self) is Model:
|
if type(self) is Model:
|
||||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||||
|
|
||||||
|
@ -87,7 +88,7 @@ class Model:
|
||||||
self.is_safetensors = len(self.part_names) > 0
|
self.is_safetensors = len(self.part_names) > 0
|
||||||
if not self.is_safetensors:
|
if not self.is_safetensors:
|
||||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||||
self.hparams = Model.load_hparams(self.dir_model)
|
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
|
||||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||||
self.tensor_names = None
|
self.tensor_names = None
|
||||||
|
@ -1541,6 +1542,17 @@ class LlamaModel(Model):
|
||||||
special_vocab._set_special_token("eot", 32010)
|
special_vocab._set_special_token("eot", 32010)
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||||
|
if tokenizer_config_file.is_file():
|
||||||
|
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||||
|
tokenizer_config_json = json.load(f)
|
||||||
|
if "add_prefix_space" in tokenizer_config_json:
|
||||||
|
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||||
|
|
||||||
|
# Apply to granite small models only
|
||||||
|
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||||
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
@ -1557,17 +1569,6 @@ class LlamaModel(Model):
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||||
|
|
||||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
|
||||||
if tokenizer_config_file.is_file():
|
|
||||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_config_json = json.load(f)
|
|
||||||
if "add_prefix_space" in tokenizer_config_json:
|
|
||||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
|
||||||
|
|
||||||
# Apply to granite small models only
|
|
||||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
|
|
|
@ -12,6 +12,7 @@ import json
|
||||||
from math import prod
|
from math import prod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||||
|
from transformers import AutoConfig
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
|
||||||
help="only print out what will be done, without writing any new files",
|
help="only print out what will be done, without writing any new files",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base", type=Path, required=True,
|
"--base", type=Path,
|
||||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
|
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"lora_path", type=Path,
|
"lora_path", type=Path,
|
||||||
|
@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
|
||||||
|
# normally, adapter does not come with base model config, we need to load it from AutoConfig
|
||||||
|
config = AutoConfig.from_pretrained(hf_model_id)
|
||||||
|
return config.to_dict()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||||
|
@ -281,7 +288,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
ftype = ftype_map[args.outtype]
|
ftype = ftype_map[args.outtype]
|
||||||
|
|
||||||
dir_base_model: Path = args.base
|
dir_base_model: Path | None = args.base
|
||||||
dir_lora: Path = args.lora_path
|
dir_lora: Path = args.lora_path
|
||||||
lora_config = dir_lora / "adapter_config.json"
|
lora_config = dir_lora / "adapter_config.json"
|
||||||
input_model = dir_lora / "adapter_model.safetensors"
|
input_model = dir_lora / "adapter_model.safetensors"
|
||||||
|
@ -301,9 +308,29 @@ if __name__ == '__main__':
|
||||||
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
||||||
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
||||||
|
|
||||||
|
# load LoRA config
|
||||||
|
with open(lora_config, "r") as f:
|
||||||
|
lparams: dict[str, Any] = json.load(f)
|
||||||
|
|
||||||
# load base model
|
# load base model
|
||||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
if dir_base_model is None:
|
||||||
hparams = Model.load_hparams(dir_base_model)
|
if "base_model_name_or_path" in lparams:
|
||||||
|
model_id = lparams["base_model_name_or_path"]
|
||||||
|
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||||
|
try:
|
||||||
|
hparams = load_hparams_from_hf(model_id)
|
||||||
|
except OSError as e:
|
||||||
|
logger.error(f"Failed to load base model config: {e}")
|
||||||
|
logger.error("Please try downloading the base model and add its path to --base")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
logger.error("'base_model_name_or_path' is not found in adapter_config.json")
|
||||||
|
logger.error("Base model config is required. Please download the base model and add its path to --base")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||||
|
hparams = Model.load_hparams(dir_base_model)
|
||||||
|
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
try:
|
try:
|
||||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||||
|
@ -323,13 +350,15 @@ if __name__ == '__main__':
|
||||||
self.dir_model_card = dir_lora_model
|
self.dir_model_card = dir_lora_model
|
||||||
self.lora_alpha = float(lora_alpha)
|
self.lora_alpha = float(lora_alpha)
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
pass
|
||||||
|
|
||||||
def set_type(self):
|
def set_type(self):
|
||||||
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
||||||
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
||||||
super().set_gguf_parameters()
|
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
|
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
|
||||||
|
@ -350,7 +379,7 @@ if __name__ == '__main__':
|
||||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||||
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
||||||
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
||||||
logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
|
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if base_name in tensor_map:
|
if base_name in tensor_map:
|
||||||
|
@ -384,9 +413,6 @@ if __name__ == '__main__':
|
||||||
yield (dest_name + ".lora_a", lora_a)
|
yield (dest_name + ".lora_a", lora_a)
|
||||||
yield (dest_name + ".lora_b", lora_b)
|
yield (dest_name + ".lora_b", lora_b)
|
||||||
|
|
||||||
with open(lora_config, "r") as f:
|
|
||||||
lparams: dict[str, Any] = json.load(f)
|
|
||||||
|
|
||||||
alpha: float = lparams["lora_alpha"]
|
alpha: float = lparams["lora_alpha"]
|
||||||
|
|
||||||
model_instance = LoraModel(
|
model_instance = LoraModel(
|
||||||
|
@ -399,6 +425,7 @@ if __name__ == '__main__':
|
||||||
dry_run=args.dry_run,
|
dry_run=args.dry_run,
|
||||||
dir_lora_model=dir_lora,
|
dir_lora_model=dir_lora,
|
||||||
lora_alpha=alpha,
|
lora_alpha=alpha,
|
||||||
|
hparams=hparams,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info("Exporting model...")
|
logger.info("Exporting model...")
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -4,6 +4,7 @@
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -248,6 +248,7 @@ struct server_slot {
|
||||||
if (is_processing()) {
|
if (is_processing()) {
|
||||||
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
||||||
|
|
||||||
|
t_last_used = ggml_time_us();
|
||||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||||
state = SLOT_STATE_IDLE;
|
state = SLOT_STATE_IDLE;
|
||||||
callback_on_release(id);
|
callback_on_release(id);
|
||||||
|
@ -731,7 +732,7 @@ struct server_context {
|
||||||
|
|
||||||
// find the slot that has at least n% prompt similarity
|
// find the slot that has at least n% prompt similarity
|
||||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
||||||
int max_lcs_len = 0;
|
int lcs_len = 0;
|
||||||
float similarity = 0;
|
float similarity = 0;
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
|
@ -746,20 +747,21 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
||||||
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
||||||
|
|
||||||
// fraction of the common subsequence length compared to the current slot's prompt length
|
// fraction of the common subsequence length compared to the current slot's prompt length
|
||||||
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
||||||
|
|
||||||
// select the current slot if the criteria match
|
// select the current slot if the criteria match
|
||||||
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
|
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
|
||||||
max_lcs_len = lcs_len;
|
lcs_len = cur_lcs_len;
|
||||||
|
similarity = cur_similarity;
|
||||||
ret = &slot;
|
ret = &slot;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ret != nullptr) {
|
if (ret != nullptr) {
|
||||||
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
|
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2704,8 +2706,8 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
|
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
|
||||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
if (ctx_server.params.embedding) {
|
||||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2810,8 +2812,8 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// TODO: maybe merge this function with "handle_completions_generic"
|
// TODO: maybe merge this function with "handle_completions_generic"
|
||||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
|
||||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
if (ctx_server.params.embedding) {
|
||||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2936,11 +2938,6 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||||
// TODO: somehow clean up this checks in the future
|
|
||||||
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
|
|
||||||
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
bool is_openai = false;
|
bool is_openai = false;
|
||||||
|
|
||||||
|
@ -2992,10 +2989,11 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||||
if (!ctx_server.params.reranking) {
|
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
|
||||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
|
|
||||||
// TODO: implement
|
// TODO: implement
|
||||||
|
|
|
@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the lengths of the input sequences
|
// get the lengths of the input sequences
|
||||||
int a_len = a.size();
|
size_t a_len = a.size();
|
||||||
int b_len = b.size();
|
size_t b_len = b.size();
|
||||||
|
|
||||||
// initialize the maximum length of the longest common subsequence (LCS)
|
// initialize the maximum length of the longest common subsequence (LCS)
|
||||||
int max_length = 0;
|
size_t max_length = 0;
|
||||||
|
|
||||||
// use two rows instead of a 2D matrix to optimize space
|
// use two rows instead of a 2D matrix to optimize space
|
||||||
std::vector<int> prev_row(b_len + 1, 0);
|
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||||
std::vector<int> curr_row(b_len + 1, 0);
|
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||||
|
|
||||||
// iterate through the elements of a
|
// iterate through the elements of a
|
||||||
for (int i = 1; i <= a_len; i++) {
|
for (size_t i = 1; i <= a_len; i++) {
|
||||||
// iterate through the elements of b
|
// iterate through the elements of b
|
||||||
for (int j = 1; j <= b_len; j++) {
|
for (size_t j = 1; j <= b_len; j++) {
|
||||||
// if elements at the current positions match
|
// if elements at the current positions match
|
||||||
if (a[i - 1] == b[j - 1]) {
|
if (a[i - 1] == b[j - 1]) {
|
||||||
// if it's the first element of either sequences, set LCS length to 1
|
// if it's the first element of either sequences, set LCS length to 1
|
||||||
|
|
|
@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||||
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
||||||
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
|
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
|
||||||
GGML_ABORT("failed to tokenize the prompt\n");
|
GGML_ABORT("failed to tokenize the prompt\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
#include "ggml-backend-impl.h"
|
#include "ggml-backend-impl.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -305,27 +305,10 @@ extern "C" {
|
||||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
//
|
// CPU buffer types are always available
|
||||||
// CPU backend
|
|
||||||
//
|
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
||||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
150
ggml/include/ggml-cpu.h
Normal file
150
ggml/include/ggml-cpu.h
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Scheduling priorities
|
||||||
|
enum ggml_sched_priority {
|
||||||
|
GGML_SCHED_PRIO_NORMAL,
|
||||||
|
GGML_SCHED_PRIO_MEDIUM,
|
||||||
|
GGML_SCHED_PRIO_HIGH,
|
||||||
|
GGML_SCHED_PRIO_REALTIME
|
||||||
|
};
|
||||||
|
|
||||||
|
// Threadpool params
|
||||||
|
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||||
|
struct ggml_threadpool_params {
|
||||||
|
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||||
|
int n_threads; // number of threads
|
||||||
|
enum ggml_sched_priority prio; // thread priority
|
||||||
|
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||||
|
bool strict_cpu; // strict cpu placement
|
||||||
|
bool paused; // start in paused state
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||||
|
|
||||||
|
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||||
|
|
||||||
|
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||||
|
// since https://github.com/ggerganov/ggml/issues/287
|
||||||
|
struct ggml_cplan {
|
||||||
|
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||||
|
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||||
|
|
||||||
|
int n_threads;
|
||||||
|
struct ggml_threadpool * threadpool;
|
||||||
|
|
||||||
|
// abort ggml_graph_compute when true
|
||||||
|
ggml_abort_callback abort_callback;
|
||||||
|
void * abort_callback_data;
|
||||||
|
};
|
||||||
|
|
||||||
|
// numa strategies
|
||||||
|
enum ggml_numa_strategy {
|
||||||
|
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||||
|
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||||
|
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||||
|
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||||
|
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||||
|
GGML_NUMA_STRATEGY_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
|
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||||
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||||
|
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||||
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||||
|
|
||||||
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||||
|
|
||||||
|
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||||
|
|
||||||
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||||
|
|
||||||
|
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||||
|
|
||||||
|
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||||
|
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||||
|
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||||
|
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||||
|
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||||
|
|
||||||
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
|
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||||
|
const struct ggml_cgraph * cgraph,
|
||||||
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||||
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||||
|
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
|
||||||
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||||
|
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
|
|
||||||
|
// TODO: move to backend interface
|
||||||
|
GGML_API int ggml_cpu_has_neon (void);
|
||||||
|
GGML_API int ggml_cpu_has_sve (void);
|
||||||
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||||
|
// get the sve vector length in bytes
|
||||||
|
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||||
|
|
||||||
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
|
||||||
|
typedef void (*ggml_from_float_to_mat_t)
|
||||||
|
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||||
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||||
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||||
|
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||||
|
const void * GGML_RESTRICT y, int nr, int nc);
|
||||||
|
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||||
|
const void * GGML_RESTRICT y, int nr, int nc);
|
||||||
|
|
||||||
|
struct ggml_type_traits_cpu {
|
||||||
|
ggml_from_float_to_mat_t from_float_to_mat;
|
||||||
|
ggml_vec_dot_t vec_dot;
|
||||||
|
enum ggml_type vec_dot_type;
|
||||||
|
int64_t nrows; // number of rows to process simultaneously
|
||||||
|
int64_t ncols; // number of columns to process simultaneously
|
||||||
|
ggml_gemv_t gemv;
|
||||||
|
ggml_gemm_t gemm;
|
||||||
|
};
|
||||||
|
|
||||||
|
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||||
|
|
||||||
|
GGML_API void ggml_cpu_init(void);
|
||||||
|
|
||||||
|
//
|
||||||
|
// CPU backend
|
||||||
|
//
|
||||||
|
|
||||||
|
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||||
|
|
||||||
|
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
|
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||||
|
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -579,6 +579,13 @@ extern "C" {
|
||||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ggml_init_params {
|
||||||
|
// memory pool
|
||||||
|
size_t mem_size; // bytes
|
||||||
|
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||||
|
bool no_alloc; // don't allocate memory for the tensor data
|
||||||
|
};
|
||||||
|
|
||||||
// n-dimensional tensor
|
// n-dimensional tensor
|
||||||
struct ggml_tensor {
|
struct ggml_tensor {
|
||||||
enum ggml_type type;
|
enum ggml_type type;
|
||||||
|
@ -624,59 +631,6 @@ extern "C" {
|
||||||
// If it returns true, the computation is aborted
|
// If it returns true, the computation is aborted
|
||||||
typedef bool (*ggml_abort_callback)(void * data);
|
typedef bool (*ggml_abort_callback)(void * data);
|
||||||
|
|
||||||
// Scheduling priorities
|
|
||||||
enum ggml_sched_priority {
|
|
||||||
GGML_SCHED_PRIO_NORMAL,
|
|
||||||
GGML_SCHED_PRIO_MEDIUM,
|
|
||||||
GGML_SCHED_PRIO_HIGH,
|
|
||||||
GGML_SCHED_PRIO_REALTIME
|
|
||||||
};
|
|
||||||
|
|
||||||
// Threadpool params
|
|
||||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
|
||||||
struct ggml_threadpool_params {
|
|
||||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
|
||||||
int n_threads; // number of threads
|
|
||||||
enum ggml_sched_priority prio; // thread priority
|
|
||||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
|
||||||
bool strict_cpu; // strict cpu placement
|
|
||||||
bool paused; // start in paused state
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
|
||||||
|
|
||||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
|
||||||
|
|
||||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
|
||||||
// since https://github.com/ggerganov/ggml/issues/287
|
|
||||||
struct ggml_cplan {
|
|
||||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
|
||||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
|
||||||
|
|
||||||
int n_threads;
|
|
||||||
struct ggml_threadpool * threadpool;
|
|
||||||
|
|
||||||
// abort ggml_graph_compute when true
|
|
||||||
ggml_abort_callback abort_callback;
|
|
||||||
void * abort_callback_data;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_init_params {
|
|
||||||
// memory pool
|
|
||||||
size_t mem_size; // bytes
|
|
||||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
|
||||||
bool no_alloc; // don't allocate memory for the tensor data
|
|
||||||
};
|
|
||||||
|
|
||||||
// numa strategies
|
|
||||||
enum ggml_numa_strategy {
|
|
||||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
|
||||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
|
||||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
|
||||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
|
||||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
|
||||||
GGML_NUMA_STRATEGY_COUNT
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// GUID
|
// GUID
|
||||||
|
@ -699,9 +653,6 @@ extern "C" {
|
||||||
// accepts a UTF-8 path, even on Windows
|
// accepts a UTF-8 path, even on Windows
|
||||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
|
||||||
|
|
||||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||||
|
|
||||||
|
@ -803,8 +754,7 @@ extern "C" {
|
||||||
int64_t ne2,
|
int64_t ne2,
|
||||||
int64_t ne3);
|
int64_t ne3);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
|
||||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||||
|
@ -814,35 +764,25 @@ extern "C" {
|
||||||
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
|
||||||
|
|
||||||
// Converts a flat index into coordinates
|
// Converts a flat index into coordinates
|
||||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
||||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
|
||||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
||||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
|
||||||
|
|
||||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||||
|
|
||||||
|
// Tensor flags
|
||||||
|
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
//
|
//
|
||||||
// operations on tensors with backpropagation
|
// operations on tensors with backpropagation
|
||||||
//
|
//
|
||||||
|
@ -2058,9 +1998,6 @@ extern "C" {
|
||||||
// automatic differentiation
|
// automatic differentiation
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
|
||||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||||
|
|
||||||
|
@ -2092,27 +2029,6 @@ extern "C" {
|
||||||
GGML_API size_t ggml_graph_overhead(void);
|
GGML_API size_t ggml_graph_overhead(void);
|
||||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||||
|
|
||||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
|
||||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
|
||||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
|
||||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
|
||||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
|
||||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
|
||||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
|
||||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
|
||||||
const struct ggml_cgraph * cgraph,
|
|
||||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
|
||||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
|
||||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
||||||
|
|
||||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
|
||||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
||||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||||
|
|
||||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||||
|
@ -2283,6 +2199,8 @@ extern "C" {
|
||||||
} lbfgs;
|
} lbfgs;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||||
|
|
||||||
// optimize the function defined by the tensor f
|
// optimize the function defined by the tensor f
|
||||||
|
@ -2314,12 +2232,6 @@ extern "C" {
|
||||||
ggml_opt_callback callback,
|
ggml_opt_callback callback,
|
||||||
void * callback_data);
|
void * callback_data);
|
||||||
|
|
||||||
//
|
|
||||||
// tensor flags
|
|
||||||
//
|
|
||||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
|
||||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
@ -2488,8 +2400,6 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
GGML_API int ggml_cpu_has_fma (void);
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
|
||||||
GGML_API int ggml_cpu_has_sve (void);
|
|
||||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||||
GGML_API int ggml_cpu_has_metal (void);
|
GGML_API int ggml_cpu_has_metal (void);
|
||||||
GGML_API int ggml_cpu_has_f16c (void);
|
GGML_API int ggml_cpu_has_f16c (void);
|
||||||
|
@ -2506,17 +2416,9 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_sycl (void);
|
GGML_API int ggml_cpu_has_sycl (void);
|
||||||
GGML_API int ggml_cpu_has_rpc (void);
|
GGML_API int ggml_cpu_has_rpc (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_API int ggml_cpu_has_vsx (void);
|
||||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
|
||||||
GGML_API int ggml_cpu_has_cann (void);
|
GGML_API int ggml_cpu_has_cann (void);
|
||||||
GGML_API int ggml_cpu_has_llamafile (void);
|
GGML_API int ggml_cpu_has_llamafile (void);
|
||||||
|
|
||||||
// get the sve vector length in bytes
|
|
||||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
|
||||||
//
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
// restrict not standard in C++
|
// restrict not standard in C++
|
||||||
#define GGML_RESTRICT
|
#define GGML_RESTRICT
|
||||||
|
@ -2525,14 +2427,6 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
typedef void (*ggml_from_float_to_mat_t)
|
|
||||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
|
||||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
|
||||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
|
||||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
|
||||||
const void * GGML_RESTRICT y, int nr, int nc);
|
|
||||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
|
||||||
const void * GGML_RESTRICT y, int nr, int nc);
|
|
||||||
|
|
||||||
struct ggml_type_traits {
|
struct ggml_type_traits {
|
||||||
const char * type_name;
|
const char * type_name;
|
||||||
|
@ -2543,13 +2437,6 @@ extern "C" {
|
||||||
ggml_to_float_t to_float;
|
ggml_to_float_t to_float;
|
||||||
ggml_from_float_t from_float;
|
ggml_from_float_t from_float;
|
||||||
ggml_from_float_t from_float_ref;
|
ggml_from_float_t from_float_ref;
|
||||||
ggml_from_float_to_mat_t from_float_to_mat;
|
|
||||||
ggml_vec_dot_t vec_dot;
|
|
||||||
enum ggml_type vec_dot_type;
|
|
||||||
int64_t nrows; // number of rows to process simultaneously
|
|
||||||
int64_t ncols; // number of columns to process simultaneously
|
|
||||||
ggml_gemv_t gemv;
|
|
||||||
ggml_gemm_t gemm;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-cpu-impl.h"
|
#include "ggml-cpu-impl.h"
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
File diff suppressed because it is too large
Load diff
13759
ggml/src/ggml-cpu.c
Normal file
13759
ggml/src/ggml-cpu.c
Normal file
File diff suppressed because it is too large
Load diff
|
@ -8,6 +8,7 @@
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -36,6 +37,20 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static inline int ggml_up32(int n) {
|
||||||
|
return (n + 31) & ~31;
|
||||||
|
}
|
||||||
|
|
||||||
|
//static inline int ggml_up64(int n) {
|
||||||
|
// return (n + 63) & ~63;
|
||||||
|
//}
|
||||||
|
|
||||||
|
static inline int ggml_up(int n, int m) {
|
||||||
|
// assert m is a power of 2
|
||||||
|
GGML_ASSERT((m & (m - 1)) == 0);
|
||||||
|
return (n + m - 1) & ~(m - 1);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// logging
|
// logging
|
||||||
//
|
//
|
||||||
|
@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
||||||
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||||
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||||
|
|
||||||
|
#define GGML_DEBUG 0
|
||||||
|
|
||||||
|
#if (GGML_DEBUG >= 1)
|
||||||
|
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||||
|
#else
|
||||||
|
#define GGML_PRINT_DEBUG(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (GGML_DEBUG >= 5)
|
||||||
|
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||||
|
#else
|
||||||
|
#define GGML_PRINT_DEBUG_5(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (GGML_DEBUG >= 10)
|
||||||
|
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||||
|
#else
|
||||||
|
#define GGML_PRINT_DEBUG_10(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// tensor params
|
||||||
|
|
||||||
|
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
||||||
|
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
||||||
|
assert(params_size <= GGML_MAX_OP_PARAMS);
|
||||||
|
memcpy(tensor->op_params, params, params_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||||
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||||
|
return ((const int32_t *)(tensor->op_params))[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||||
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||||
|
return ((const float *)(tensor->op_params))[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
||||||
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||||
|
((int32_t *)(tensor->op_params))[i] = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
||||||
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||||
|
((float *)(tensor->op_params))[i] = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_map_custom1_op_params {
|
||||||
|
ggml_custom1_op_t fun;
|
||||||
|
int n_tasks;
|
||||||
|
void * userdata;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct ggml_map_custom2_op_params {
|
||||||
|
ggml_custom2_op_t fun;
|
||||||
|
int n_tasks;
|
||||||
|
void * userdata;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct ggml_map_custom3_op_params {
|
||||||
|
ggml_custom3_op_t fun;
|
||||||
|
int n_tasks;
|
||||||
|
void * userdata;
|
||||||
|
};
|
||||||
|
|
||||||
// bitset
|
// bitset
|
||||||
|
|
||||||
typedef uint32_t ggml_bitset_t;
|
typedef uint32_t ggml_bitset_t;
|
||||||
|
@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||||
void * ggml_aligned_malloc(size_t size);
|
void * ggml_aligned_malloc(size_t size);
|
||||||
void ggml_aligned_free(void * ptr, size_t size);
|
void ggml_aligned_free(void * ptr, size_t size);
|
||||||
|
|
||||||
|
// TODO: move to threading file
|
||||||
|
void ggml_critical_section_start(void);
|
||||||
|
void ggml_critical_section_end(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -2776,11 +2776,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||||
const short iv3 = iq3 / rv3;
|
const short iv3 = iq3 / rv3;
|
||||||
|
|
||||||
// load the queries from shared memory into local memory
|
// load the queries from shared memory into local memory
|
||||||
float4 mq[D4];
|
float4 mq[D4/NW];
|
||||||
|
|
||||||
for (short ii = 0; ii < D4; ii += NW) {
|
for (short ii = 0; ii < D4; ii += NW) {
|
||||||
short i = ii + tiisg;
|
short i = ii + tiisg;
|
||||||
mq[i] = (float4) sq4[i];
|
mq[ii/NW] = (float4) sq4[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
// pointer to the mask
|
// pointer to the mask
|
||||||
|
@ -2812,7 +2812,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||||
mk[2] = (float4) pk4[i + 2*(nb11/8)];
|
mk[2] = (float4) pk4[i + 2*(nb11/8)];
|
||||||
mk[3] = (float4) pk4[i + 3*(nb11/8)];
|
mk[3] = (float4) pk4[i + 3*(nb11/8)];
|
||||||
|
|
||||||
mqk += (float4) (mq[i] * mk);
|
mqk += (float4) (mq[ii/NW] * mk);
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce the results from the threads in the simdgroup
|
// reduce the results from the threads in the simdgroup
|
||||||
|
@ -2857,8 +2857,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||||
// O = diag(ms)*O
|
// O = diag(ms)*O
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (short ii = 0; ii < D4; ii += NW) {
|
for (short ii = 0; ii < D4; ii += NW) {
|
||||||
const short i = ii + tiisg;
|
lo[ii/NW] *= ms;
|
||||||
lo[i/NW] *= ms;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2872,10 +2871,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||||
for (short ii = 0; ii < D4; ii += NW) {
|
for (short ii = 0; ii < D4; ii += NW) {
|
||||||
const short i = ii + tiisg;
|
const short i = ii + tiisg;
|
||||||
|
|
||||||
lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
|
lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
|
||||||
lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
|
lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
|
||||||
lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
|
lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
|
||||||
lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
|
lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
||||||
UNUSED(dev);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
|
||||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
||||||
|
|
||||||
UNUSED(dev);
|
|
||||||
UNUSED(max_tensor_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||||
UNUSED(dev);
|
UNUSED(dev);
|
||||||
UNUSED(op);
|
UNUSED(op);
|
||||||
|
@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||||
/* .init_backend = */ ggml_backend_rpc_device_init,
|
/* .init_backend = */ ggml_backend_rpc_device_init,
|
||||||
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
||||||
/* .get_host_buffer_type = */ NULL,
|
/* .get_host_buffer_type = */ NULL,
|
||||||
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
|
/* .buffer_from_host_ptr = */ NULL,
|
||||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
|
|
15324
ggml/src/ggml.c
15324
ggml/src/ggml.c
File diff suppressed because it is too large
Load diff
|
@ -2,6 +2,7 @@
|
||||||
#define LLAMA_H
|
#define LLAMA_H
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
|
|
||||||
#include "model_adapter.h"
|
#include "model_adapter.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
|
||||||
#ifdef SD_USE_CUBLAS
|
#ifdef SD_USE_CUBLAS
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
|
1
spm-headers/ggml-cpu.h
Symbolic link
1
spm-headers/ggml-cpu.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml/include/ggml-cpu.h
|
|
@ -19524,12 +19524,26 @@ struct llama_context * llama_new_context_with_model(
|
||||||
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
|
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||||
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
|
||||||
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
|
||||||
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
|
||||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
||||||
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
||||||
|
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
||||||
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||||
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||||
|
|
||||||
|
if (n_ctx_per_seq < hparams.n_ctx_train) {
|
||||||
|
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||||
|
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_ctx_per_seq > hparams.n_ctx_train) {
|
||||||
|
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||||
|
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||||
|
}
|
||||||
|
|
||||||
ctx->abort_callback = params.abort_callback;
|
ctx->abort_callback = params.abort_callback;
|
||||||
ctx->abort_callback_data = params.abort_callback_data;
|
ctx->abort_callback_data = params.abort_callback_data;
|
||||||
|
@ -21980,6 +21994,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * llama_print_system_info(void) {
|
const char * llama_print_system_info(void) {
|
||||||
|
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||||
|
|
||||||
static std::string s;
|
static std::string s;
|
||||||
|
|
||||||
s = "";
|
s = "";
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue