Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakePresets.json
#	Makefile
#	Package.swift
#	ci/run.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	flake.lock
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-backend.cpp
#	ggml/src/ggml.c
#	pocs/vdot/q8dot.cpp
#	pocs/vdot/vdot.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-quantize-fns.cpp
#	tests/test-quantize-perf.cpp
#	tests/test-rope.cpp
Concedo 2024-11-04 16:54:53 +08:00
commit bb13925f39
34 changed files with 14917 additions and 19458 deletions


@@ -421,6 +421,8 @@ endif()
add_library(ggml
ggml/src/ggml.c
ggml/include/ggml.h
+ggml/src/ggml-cpu.c
+ggml/include/ggml-cpu.h
ggml/src/ggml-alloc.c
ggml/include/ggml-alloc.h
ggml/src/ggml-backend.cpp


@@ -430,6 +430,17 @@ ggml_v4_vulkan.o: ggml/src/ggml.c ggml/include/ggml.h
ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
+ggml-cpu.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
+$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
+ggml-cpu_v4_failsafe.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
+$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
+ggml-cpu_v4_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
+$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
+ggml-cpu_v4_clblast.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
+$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml-cpu_v4_clblast_noavx2.o: ggml/src/ggml-cpu.c ggml/include/ggml-cpu.h
+$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
#quants
ggml-quants.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
@@ -574,18 +585,18 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
# useful tools
-main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo '==== Run ./main -h for help. ===='
-sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -595,11 +606,11 @@ vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(shell) vulkan-shaders-gen --glslc glslc --input-dir ggml/src/vulkan-shaders --target-hpp ggml/src/ggml-vulkan-shaders.hpp --target-cpp ggml/src/ggml-vulkan-shaders.cpp
#generated libraries
-koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_default: ggml.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(DEFAULT_BUILD)
ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
+koboldcpp_failsafe: ggml_v4_failsafe.o ggml-cpu_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
$(FAILSAFE_BUILD)
else
koboldcpp_failsafe:
@@ -607,7 +618,7 @@ koboldcpp_failsafe:
endif
ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_noavx2: ggml_v4_noavx2.o ggml-cpu_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
$(NOAVX2_BUILD)
else
koboldcpp_noavx2:
@@ -615,10 +626,10 @@ koboldcpp_noavx2:
endif
ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
+koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
$(CLBLAST_BUILD)
ifdef NOAVX2_BUILD
-koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_SIMPLE) $(OBJS)
$(CLBLAST_BUILD)
else
koboldcpp_clblast_noavx2:
@@ -632,7 +643,7 @@ koboldcpp_clblast_noavx2:
endif
ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_cublas: ggml_v4_cublas.o ggml.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS)
$(CUBLAS_BUILD)
else
koboldcpp_cublas:
@@ -640,7 +651,7 @@ koboldcpp_cublas:
endif
ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
+koboldcpp_hipblas: ggml_v4_cublas.o ggml.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o llavaclip_cublas.o llava.o ggml-backend_cublas.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS)
$(HIPBLAS_BUILD)
else
koboldcpp_hipblas:
@@ -648,10 +659,10 @@ koboldcpp_hipblas:
endif
ifdef VULKAN_BUILD
-koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
+koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
$(VULKAN_BUILD)
ifdef NOAVX2_BUILD
-koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
$(VULKAN_BUILD)
else
koboldcpp_vulkan_noavx2:
@@ -665,17 +676,17 @@ koboldcpp_vulkan_noavx2:
endif
# tools
-quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
+quantize_gguf: examples/quantize/quantize.cpp ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
+quantize_gptj: otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
+quantize_gpt2: otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
+quantize_neox: otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
+quantize_mpt: otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o llama.o ggml-backend_default.o $(OBJS_FULL)
+quantize_clip: examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp ggml_v3.o ggml.o ggml-cpu.o llama.o ggml-backend_default.o $(OBJS_FULL)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
#window simple clinfo


@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-apple-darwin-macho )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )


@@ -1953,6 +1953,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+ggml_cpu_init(); // some ARM features are detected at runtime
const auto & sparams = params.sparams;
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);


@@ -151,7 +151,7 @@ struct common_sampler_params {
struct common_params {
int32_t n_predict = -1; // new tokens to predict
-int32_t n_ctx = 0; // context size
+int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt

File diff suppressed because it is too large


@@ -1,233 +0,0 @@
// Various helper functions and utilities for training
#pragma once
#include <string>
#include <random>
#include <vector>
#include "ggml.h"
#include "llama.h"
#define LLAMA_TRAIN_MAX_NODES 16384
typedef std::string mt19937_state;
struct train_state {
struct ggml_opt_context * opt;
uint64_t train_its;
uint64_t train_samples;
uint64_t train_tokens;
uint64_t train_epochs;
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
mt19937_state shuffle_rng_state_current;
mt19937_state shuffle_rng_state_next;
size_t shuffle_sample_count;
size_t shuffle_next_sample;
};
struct train_params_common {
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * pattern_fn_it;
const char * fn_latest;
bool print_usage;
int save_every;
uint32_t seed;
int n_ctx;
int n_threads;
int n_batch;
int n_gradient_accumulation;
int n_epochs;
int n_gpu_layers;
bool custom_n_ctx;
bool use_flash;
bool use_checkpointing;
std::string sample_start;
bool include_sample_start;
bool escape;
bool overlapping_samples;
bool fill_with_next_samples;
bool separate_with_eos;
bool separate_with_bos;
bool sample_random_offsets;
bool force_reshuffle;
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_min;
bool enable_restart;
int opt_past;
float opt_delta;
int opt_max_no_improvement;
int adam_n_iter;
float adam_alpha;
float adam_min_alpha;
float adam_decay;
int adam_decay_min_ndim;
float adam_beta1;
float adam_beta2;
float adam_gclip;
float adam_eps_f;
};
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
struct train_opt_callback_data {
struct train_params_common * params;
struct train_state * train;
save_train_files_callback save_cb;
void * save_data;
struct llama_context * lctx;
int last_save_iter;
llama_token * tokens_data;
size_t tokens_size;
size_t * samples_begin;
size_t * samples_size;
size_t * shuffled_samples_offs;
size_t * shuffled_samples_begin;
size_t * shuffled_samples_size;
size_t samples_count;
struct ggml_tensor * tokens_input;
struct ggml_tensor * target_probs;
int first_iter;
int first_epoch;
int iter_at_last_epoch;
int64_t last_time;
double millis_per_iter;
};
struct train_state * init_train_state();
void free_train_state(struct train_state * state);
struct train_params_common get_default_train_params_common();
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
void finish_processing_train_args(struct train_params_common * params);
struct random_normal_distribution;
struct random_uniform_distribution;
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
void free_random_normal_distribution (struct random_normal_distribution * rnd);
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
// generate random float in interval [0,1)
float frand();
float frand_normal (struct random_normal_distribution * rnd);
float frand_uniform(struct random_uniform_distribution * rnd);
int clamp (const int v, const int min, const int max);
float fclamp(const float v, const float min, const float max);
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
size_t tokenize_file(
struct llama_context * lctx,
const char * filename,
const std::string & sample_start,
bool include_sample_start,
bool overlapping_samples,
unsigned context_length,
std::vector<llama_token> & out_tokens,
std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size);
int64_t get_example_targets_batch(
struct llama_context * lctx,
struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs,
int64_t example_id,
const size_t * samples_offs,
const size_t * samples_begin,
const size_t * samples_size,
size_t samples_count,
const llama_token * train_data,
size_t n_train_data,
bool separate_with_eos,
bool separate_with_bos,
bool fill_with_next_samples,
bool sample_random_offsets);
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
mt19937_state mt19937_get_state(const std::mt19937& rng);
mt19937_state mt19937_seed_to_state(unsigned seed);
mt19937_state shuffle_samples(
const mt19937_state & rng_state,
size_t * shuffled_offs,
size_t * shuffled_begins,
size_t * shuffled_sizes,
const size_t * begins,
const size_t * sizes,
size_t count);
size_t hash_combine(size_t h1, size_t h2);
size_t compute_samples_hash(
const char* fn,
const size_t* samples_begin,
const size_t* samples_size,
size_t sample_count);
std::string replace_str(const char * s, const char * needle, const char * replacement);
void print_duration(double milliseconds);
float cosine_decay(
int64_t step,
int64_t decay_steps,
float minimum);
float cosine_decay_restart(
int64_t step,
int64_t decay_steps,
float minimum,
float restart_step_mult);
float learning_schedule(
int64_t step,
int64_t warmup_steps,
int64_t decay_steps,
float learning_rate,
float overall_minimum,
float cos_decay_minimum,
float cos_decay_restart_step_mult,
bool enable_restart);
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);


@@ -72,7 +72,8 @@ class Model:
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
-split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@@ -87,7 +88,7 @@ class Model:
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-self.hparams = Model.load_hparams(self.dir_model)
+self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
@@ -1541,6 +1542,17 @@ class LlamaModel(Model):
special_vocab._set_special_token("eot", 32010)
special_vocab.add_to_gguf(self.gguf_writer)
+tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+if tokenizer_config_file.is_file():
+with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+tokenizer_config_json = json.load(f)
+if "add_prefix_space" in tokenizer_config_json:
+self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+# Apply to granite small models only
+if self.hparams.get("vocab_size", 32000) == 49152:
+self.gguf_writer.add_add_bos_token(False)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -1557,17 +1569,6 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-if tokenizer_config_file.is_file():
-with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-tokenizer_config_json = json.load(f)
-if "add_prefix_space" in tokenizer_config_json:
-self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-# Apply to granite small models only
-if self.hparams.get("vocab_size", 32000) == 49152:
-self.gguf_writer.add_add_bos_token(False)
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:


@@ -12,6 +12,7 @@ import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from transformers import AutoConfig
import torch
@@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
help="only print out what will be done, without writing any new files",
)
parser.add_argument(
-"--base", type=Path, required=True,
-help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
+"--base", type=Path,
+help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"lora_path", type=Path,
@@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+# normally, adapter does not come with base model config, we need to load it from AutoConfig
+config = AutoConfig.from_pretrained(hf_model_id)
+return config.to_dict()
if __name__ == '__main__':
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ if __name__ == '__main__':
ftype = ftype_map[args.outtype]
-dir_base_model: Path = args.base
+dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ if __name__ == '__main__':
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
+# load LoRA config
+with open(lora_config, "r") as f:
+lparams: dict[str, Any] = json.load(f)
# load base model
-logger.info(f"Loading base model: {dir_base_model.name}")
-hparams = Model.load_hparams(dir_base_model)
+if dir_base_model is None:
+if "base_model_name_or_path" in lparams:
+model_id = lparams["base_model_name_or_path"]
+logger.info(f"Loading base model from Hugging Face: {model_id}")
+try:
+hparams = load_hparams_from_hf(model_id)
+except OSError as e:
+logger.error(f"Failed to load base model config: {e}")
+logger.error("Please try downloading the base model and add its path to --base")
+sys.exit(1)
+else:
+logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+logger.error("Base model config is required. Please download the base model and add its path to --base")
+sys.exit(1)
+else:
+logger.info(f"Loading base model: {dir_base_model.name}")
+hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,15 @@ if __name__ == '__main__':
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
+def set_vocab(self):
+pass
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
+super().set_gguf_parameters()
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
@@ -350,7 +379,7 @@ if __name__ == '__main__':
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
+logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
sys.exit(1)
if base_name in tensor_map:
@@ -384,9 +413,6 @@ if __name__ == '__main__':
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
-with open(lora_config, "r") as f:
-lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
@@ -399,6 +425,7 @@ if __name__ == '__main__':
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
+hparams=hparams,
)
logger.info("Exporting model...")

File diff suppressed because it is too large


@@ -4,6 +4,7 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
+#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"


@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif


@@ -248,6 +248,7 @@ struct server_slot {
if (is_processing()) {
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
+t_last_used = ggml_time_us();
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
callback_on_release(id);
@@ -731,7 +732,7 @@ struct server_context {
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
-int max_lcs_len = 0;
+int lcs_len = 0;
float similarity = 0;
for (server_slot & slot : slots) {
@@ -746,20 +747,21 @@ struct server_context {
}
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
// fraction of the common subsequence length compared to the current slot's prompt length
-similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
+float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
// select the current slot if the criteria match
-if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
-max_lcs_len = lcs_len;
+if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
+lcs_len = cur_lcs_len;
+similarity = cur_similarity;
ret = &slot;
}
}
if (ret != nullptr) {
-SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
+SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
}
}
@@ -2704,8 +2706,8 @@ int main(int argc, char ** argv) {
};
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-if (ctx_server.params.embedding || ctx_server.params.reranking) {
-res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+if (ctx_server.params.embedding) {
+res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@@ -2810,8 +2812,8 @@ int main(int argc, char ** argv) {
// TODO: maybe merge this function with "handle_completions_generic"
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-if (ctx_server.params.embedding || ctx_server.params.reranking) {
-res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+if (ctx_server.params.embedding) {
+res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@@ -2936,11 +2938,6 @@ int main(int argc, char ** argv) {
};
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-// TODO: somehow clean up this checks in the future
-if (!ctx_server.params.embedding || ctx_server.params.reranking) {
-res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
-return;
-}
const json body = json::parse(req.body);
bool is_openai = false;
@@ -2992,10 +2989,11 @@ int main(int argc, char ** argv) {
};
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-if (!ctx_server.params.reranking) {
-res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
+if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
const json body = json::parse(req.body);
// TODO: implement


@@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
}
// get the lengths of the input sequences
-int a_len = a.size();
-int b_len = b.size();
+size_t a_len = a.size();
+size_t b_len = b.size();
// initialize the maximum length of the longest common subsequence (LCS)
-int max_length = 0;
+size_t max_length = 0;
// use two rows instead of a 2D matrix to optimize space
-std::vector<int> prev_row(b_len + 1, 0);
-std::vector<int> curr_row(b_len + 1, 0);
+std::vector<size_t> prev_row(b_len + 1, 0);
+std::vector<size_t> curr_row(b_len + 1, 0);
// iterate through the elements of a
-for (int i = 1; i <= a_len; i++) {
+for (size_t i = 1; i <= a_len; i++) {
// iterate through the elements of b
-for (int j = 1; j <= b_len; j++) {
+for (size_t j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequences, set LCS length to 1
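The hunk above only widens the counters from int to size_t; the rolling two-row scheme itself is unchanged. A self-contained sketch of that scheme, reconstructed from the lines and comments visible in this hunk (the tail of the function lies outside the hunk, so treat this as an approximation of the server helper, not its exact code), with std::vector<int> standing in for llama_tokens:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sketch: length of the longest run of tokens that two sequences share,
    // using two rolling rows instead of a full 2D matrix.
    static size_t longest_common_run(const std::vector<int> & a, const std::vector<int> & b) {
        if (a.empty() || b.empty()) {
            return 0;
        }

        const size_t a_len = a.size();
        const size_t b_len = b.size();

        size_t max_length = 0;

        std::vector<size_t> prev_row(b_len + 1, 0);
        std::vector<size_t> curr_row(b_len + 1, 0);

        for (size_t i = 1; i <= a_len; i++) {
            for (size_t j = 1; j <= b_len; j++) {
                if (a[i - 1] == b[j - 1]) {
                    // first element of either sequence starts a run of length 1,
                    // otherwise extend the run ending at the previous positions
                    curr_row[j] = (i == 1 || j == 1) ? 1 : prev_row[j - 1] + 1;
                    max_length  = std::max(max_length, curr_row[j]);
                } else {
                    curr_row[j] = 0; // the run is broken
                }
            }
            prev_row.swap(curr_row); // reuse the buffers for the next row
        }

        return max_length;
    }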


@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
-if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
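The only change above is the add_special argument: BOS/special prefix tokens are now added only while the KV cache is still empty (llama_get_kv_cache_used_cells(ctx) == 0), which matters once follow-up prompts are fed into an already-populated context. A hedged sketch (not part of the commit) of the same two-pass tokenization pattern as a helper; the helper name is illustrative, model and ctx are assumed to be created elsewhere, and the sketch passes add_special consistently to both passes and trims to the count returned by the second:

    #include <string>
    #include <vector>

    #include "llama.h"

    // Sketch: tokenize a prompt, adding special (BOS) tokens only while the
    // KV cache is still empty, mirroring the change in the hunk above.
    static std::vector<llama_token> tokenize_prompt(const llama_model * model,
                                                    llama_context * ctx,
                                                    const std::string & prompt) {
        const bool add_special = llama_get_kv_cache_used_cells(ctx) == 0;

        // first pass with a NULL buffer: the negated return value is the token count
        const int n_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(),
                                             NULL, 0, add_special, /*parse_special=*/true);

        std::vector<llama_token> tokens(n_tokens);
        const int n_filled = llama_tokenize(model, prompt.c_str(), prompt.size(),
                                            tokens.data(), tokens.size(), add_special, true);
        if (n_filled < 0) {
            GGML_ABORT("failed to tokenize the prompt\n");
        }
        tokens.resize(n_filled);
        return tokens;
    }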


@@ -1,6 +1,7 @@
#include "ggml.h"
#include "ggml-opencl.h"
#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
#include <array>
#include <atomic>


@@ -1,6 +1,7 @@
#pragma once
#include "ggml.h"
+#include "ggml-cpu.h"
#include "ggml-backend.h"
#ifdef __cplusplus


@@ -305,27 +305,10 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
-//
-// CPU backend
-//
-GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
-GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-// Create a backend buffer from an existing pointer
+// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
-#ifdef GGML_USE_CPU_HBM
-GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
#ifdef __cplusplus
}
#endif

ggml/include/ggml-cpu.h (new file, 150 lines)

@@ -0,0 +1,150 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
// TODO: move to backend interface
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu {
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
#ifdef __cplusplus
}
#endif
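With the split, everything needed to run a graph on the CPU (thread-pool parameters, ggml_graph_plan()/ggml_graph_compute(), the scalar tensor accessors and the CPU feature queries) now comes from this header, while tensor and graph construction stays in ggml.h. A minimal sketch (not part of the commit) of driving a tiny graph through the relocated API, under the assumption of a stock ggml build after this change:

    #include <cstdio>

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main() {
        ggml_cpu_init(); // runtime CPU feature detection (declared above)

        // small context with room for the tensors, the graph and the work data
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 2.0f); // scalar setters are now declared in ggml-cpu.h
        ggml_set_f32(b, 3.0f);

        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        // plan + compute entry points also moved into ggml-cpu.h
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

        std::printf("c[0] = %f\n", ggml_get_f32_1d(c, 0));
        ggml_free(ctx);
        return 0;
    }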


@@ -579,6 +579,13 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};
+struct ggml_init_params {
+// memory pool
+size_t mem_size; // bytes
+void * mem_buffer; // if NULL, memory will be allocated internally
+bool no_alloc; // don't allocate memory for the tensor data
+};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@@ -624,59 +631,6 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
-// Scheduling priorities
-enum ggml_sched_priority {
-GGML_SCHED_PRIO_NORMAL,
-GGML_SCHED_PRIO_MEDIUM,
-GGML_SCHED_PRIO_HIGH,
-GGML_SCHED_PRIO_REALTIME
-};
-// Threadpool params
-// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-struct ggml_threadpool_params {
-bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-int n_threads; // number of threads
-enum ggml_sched_priority prio; // thread priority
-uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
-bool strict_cpu; // strict cpu placement
-bool paused; // start in paused state
-};
-struct ggml_threadpool; // forward declaration, see ggml.c
-typedef struct ggml_threadpool * ggml_threadpool_t;
-// the compute plan that needs to be prepared for ggml_graph_compute()
-// since https://github.com/ggerganov/ggml/issues/287
-struct ggml_cplan {
-size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-int n_threads;
-struct ggml_threadpool * threadpool;
-// abort ggml_graph_compute when true
-ggml_abort_callback abort_callback;
-void * abort_callback_data;
-};
-struct ggml_init_params {
-// memory pool
-size_t mem_size; // bytes
-void * mem_buffer; // if NULL, memory will be allocated internally
-bool no_alloc; // don't allocate memory for the tensor data
-};
-// numa strategies
-enum ggml_numa_strategy {
-GGML_NUMA_STRATEGY_DISABLED = 0,
-GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-GGML_NUMA_STRATEGY_ISOLATE = 2,
-GGML_NUMA_STRATEGY_NUMACTL = 3,
-GGML_NUMA_STRATEGY_MIRROR = 4,
-GGML_NUMA_STRATEGY_COUNT
-};
//
// GUID
@@ -699,9 +653,6 @@ extern "C" {
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
-GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
@@ -803,8 +754,7 @@ extern "C" {
int64_t ne2,
int64_t ne3);
-GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@@ -814,35 +764,25 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
-GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
// Converts a flat index into coordinates
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
-GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
-// Tensor flags
-GGML_API void ggml_set_input(struct ggml_tensor * tensor);
-GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
-GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
//
// operations on tensors with backpropagation
//
@ -2058,9 +1998,6 @@ extern "C" {
// automatic differentiation // automatic differentiation
// //
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
@ -2092,27 +2029,6 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
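The two comments above spell out the calling protocol: plan first, then compute, and allocate plan.work_data yourself whenever plan.work_size is non-zero. Below is a minimal, self-contained sketch of that flow; the tensor shapes, values and thread count are illustrative only, and it assumes the declarations are visible through the appropriate header (ggml.h before this commit, ggml-cpu.h after it).

    #include "ggml.h"
    #include <stdlib.h>
    #include <string.h>

    int main(void) {
        struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        memcpy(x->data, (float[4]){ 1,  2,  3,  4}, 4*sizeof(float));
        memcpy(y->data, (float[4]){10, 20, 30, 40}, 4*sizeof(float));

        struct ggml_tensor * f = ggml_add(ctx, x, y);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, f);

        // plan first; the caller owns the scratch buffer whenever work_size > 0
        struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads=*/4, /*threadpool=*/NULL);
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size);
        }
        ggml_graph_compute(gf, &plan);

        free(plan.work_data);
        ggml_free(ctx);
        return 0;
    }

ggml_graph_compute_with_ctx() wraps the same sequence but carves the work buffer out of the context's own memory pool, which is why the context must be sized generously enough up front.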
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
@ -2283,6 +2199,8 @@ extern "C" {
} lbfgs; } lbfgs;
}; };
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f // optimize the function defined by the tensor f
@ -2314,12 +2232,6 @@ extern "C" {
ggml_opt_callback callback, ggml_opt_callback callback,
void * callback_data); void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
// //
// quantization // quantization
// //
@ -2488,8 +2400,6 @@ extern "C" {
GGML_API int ggml_cpu_has_avx512_bf16(void); GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_amx_int8 (void); GGML_API int ggml_cpu_has_amx_int8 (void);
GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);
GGML_API int ggml_cpu_has_arm_fma (void); GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void); GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void); GGML_API int ggml_cpu_has_f16c (void);
@ -2506,17 +2416,9 @@ extern "C" {
GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void); GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void); GGML_API int ggml_cpu_has_llamafile (void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
//
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus #ifdef __cplusplus
// restrict not standard in C++ // restrict not standard in C++
#define GGML_RESTRICT #define GGML_RESTRICT
@ -2525,14 +2427,6 @@ extern "C" {
#endif #endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits { struct ggml_type_traits {
const char * type_name; const char * type_name;
@ -2543,13 +2437,6 @@ extern "C" {
ggml_to_float_t to_float; ggml_to_float_t to_float;
ggml_from_float_t from_float; ggml_from_float_t from_float;
ggml_from_float_t from_float_ref; ggml_from_float_t from_float_ref;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
}; };
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);


@ -7,6 +7,7 @@
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#include <math.h> #include <math.h>

File diff suppressed because it is too large

ggml/src/ggml-cpu.c (new file, 13759 lines)

File diff suppressed because it is too large


@ -8,6 +8,7 @@
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <string.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -36,6 +37,20 @@ extern "C" {
#endif #endif
#endif #endif
static inline int ggml_up32(int n) {
return (n + 31) & ~31;
}
//static inline int ggml_up64(int n) {
// return (n + 63) & ~63;
//}
static inline int ggml_up(int n, int m) {
// assert m is a power of 2
GGML_ASSERT((m & (m - 1)) == 0);
return (n + m - 1) & ~(m - 1);
}
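For orientation, a few worked values for the alignment helpers above (the inputs are chosen purely for illustration):

    // ggml_up32(1)     == 32    rounds up to the next multiple of 32
    // ggml_up32(64)    == 64    already aligned, returned unchanged
    // ggml_up(100, 32) == 128   generic form; m must be a power of two
    // ggml_up(100, 48)          would trip the GGML_ASSERT, since 48 is not a power of two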
// //
// logging // logging
// //
@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
#define GGML_DEBUG 0
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
// tensor params
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
assert(params_size <= GGML_MAX_OP_PARAMS);
memcpy(tensor->op_params, params, params_size);
}
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
return ((const int32_t *)(tensor->op_params))[i];
}
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
return ((const float *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value;
}
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
((float *)(tensor->op_params))[i] = value;
}
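A hedged usage sketch for the op-params accessors above; the slot layout (an int32 count in slot 0, a float epsilon in slot 1) is illustrative and not a convention of any particular op:

    // when constructing the op:
    ggml_set_op_params_i32(tensor, 0, 4);        // e.g. number of dimensions affected
    ggml_set_op_params_f32(tensor, 1, 1e-6f);    // e.g. a numerical epsilon

    // later, inside the op's compute function:
    const int32_t n_dims = ggml_get_op_params_i32(tensor, 0);
    const float   eps    = ggml_get_op_params_f32(tensor, 1);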
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
// bitset // bitset
typedef uint32_t ggml_bitset_t; typedef uint32_t ggml_bitset_t;
@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
void * ggml_aligned_malloc(size_t size); void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size); void ggml_aligned_free(void * ptr, size_t size);
// TODO: move to threading file
void ggml_critical_section_start(void);
void ggml_critical_section_end(void);
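A hedged sketch of the typical use of the critical-section pair declared above: serializing one-time global initialization. The flag and init_tables() helper are placeholders, not part of ggml:

    static int initialized = 0;

    static void lazy_init(void) {
        ggml_critical_section_start();
        if (!initialized) {
            init_tables();   // hypothetical one-time setup, guarded against concurrent callers
            initialized = 1;
        }
        ggml_critical_section_end();
    }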
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


@ -2776,11 +2776,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short iv3 = iq3 / rv3; const short iv3 = iq3 / rv3;
// load the queries from shared memory into local memory // load the queries from shared memory into local memory
float4 mq[D4]; float4 mq[D4/NW];
for (short ii = 0; ii < D4; ii += NW) { for (short ii = 0; ii < D4; ii += NW) {
short i = ii + tiisg; short i = ii + tiisg;
mq[i] = (float4) sq4[i]; mq[ii/NW] = (float4) sq4[i];
} }
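For intuition (sizes assumed): with a head size of D = 128 there are D4 = 32 float4 values per query row, and with a simdgroup width of NW = 32 each thread only ever touches the entries at i = ii + tiisg. The old per-thread array therefore reserved 32 entries of which a single one was used; the new D4/NW-sized array keeps exactly that one entry, indexed by ii/NW, cutting register pressure without changing the values loaded.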
// pointer to the mask // pointer to the mask
@ -2812,7 +2812,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
mk[2] = (float4) pk4[i + 2*(nb11/8)]; mk[2] = (float4) pk4[i + 2*(nb11/8)];
mk[3] = (float4) pk4[i + 3*(nb11/8)]; mk[3] = (float4) pk4[i + 3*(nb11/8)];
mqk += (float4) (mq[i] * mk); mqk += (float4) (mq[ii/NW] * mk);
} }
// reduce the results from the threads in the simdgroup // reduce the results from the threads in the simdgroup
@ -2857,8 +2857,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
// O = diag(ms)*O // O = diag(ms)*O
#pragma unroll #pragma unroll
for (short ii = 0; ii < D4; ii += NW) { for (short ii = 0; ii < D4; ii += NW) {
const short i = ii + tiisg; lo[ii/NW] *= ms;
lo[i/NW] *= ms;
} }
} }
@ -2872,10 +2871,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
for (short ii = 0; ii < D4; ii += NW) { for (short ii = 0; ii < D4; ii += NW) {
const short i = ii + tiisg; const short i = ii + tiisg;
lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
} }
} }
} }


@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
UNUSED(dev); UNUSED(dev);
} }
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
UNUSED(dev);
UNUSED(max_tensor_size);
}
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
UNUSED(dev); UNUSED(dev);
UNUSED(op); UNUSED(op);
@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
/* .init_backend = */ ggml_backend_rpc_device_init, /* .init_backend = */ ggml_backend_rpc_device_init,
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type, /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL, /* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr, /* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_rpc_device_supports_op, /* .supports_op = */ ggml_backend_rpc_device_supports_op,
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft, /* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,

File diff suppressed because it is too large


@ -2,6 +2,7 @@
#define LLAMA_H #define LLAMA_H
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include <stddef.h> #include <stddef.h>


@ -11,6 +11,7 @@
#include "model_adapter.h" #include "model_adapter.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#include <chrono> #include <chrono>


@ -23,6 +23,7 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#ifdef SD_USE_CUBLAS #ifdef SD_USE_CUBLAS
#include "ggml-cuda.h" #include "ggml-cuda.h"


@ -21,6 +21,7 @@
#endif #endif
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"

spm-headers/ggml-cpu.h (new symbolic link, 1 line)

@ -0,0 +1 @@
../ggml/include/ggml-cpu.h


@ -19524,12 +19524,26 @@ struct llama_context * llama_new_context_with_model(
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
} }
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
if (n_ctx_per_seq < hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
if (n_ctx_per_seq > hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
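A worked example of the checks above (numbers assumed): with cparams.n_ctx = 8192, cparams.n_seq_max = 4 and hparams.n_ctx_train = 4096, n_ctx_per_seq = 8192 / 4 = 2048. That is below the trained context, so the first warning fires and each of the 4 sequences can use at most 2048 tokens; conversely, a single-sequence context of 8192 would trigger the overflow warning instead.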
ctx->abort_callback = params.abort_callback; ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data; ctx->abort_callback_data = params.abort_callback_data;
@ -21980,6 +21994,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
} }
const char * llama_print_system_info(void) { const char * llama_print_system_info(void) {
ggml_cpu_init(); // some ARM features are detected at runtime
static std::string s; static std::string s;
s = ""; s = "";


@ -1,4 +1,5 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include <chrono> #include <chrono>