mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
test to see if Ofast for ggml library plus batching adjustments fixes speed regression for ggmlv1 models
This commit is contained in:
parent
a6aff3fba0
commit
b5ba6c9ece
2 changed files with 35 additions and 34 deletions
51
Makefile
51
Makefile
|
@ -42,6 +42,7 @@ endif
|
||||||
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
||||||
|
|
||||||
# these are used on windows, to build some libraries with extra old device compatibility
|
# these are used on windows, to build some libraries with extra old device compatibility
|
||||||
SIMPLECFLAGS =
|
SIMPLECFLAGS =
|
||||||
|
@ -380,23 +381,23 @@ $(info )
|
||||||
#
|
#
|
||||||
|
|
||||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||||
ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#quants
|
#quants
|
||||||
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
|
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
|
||||||
|
@ -415,41 +416,41 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||||
|
|
||||||
#version 3 libs
|
#version 3 libs
|
||||||
ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||||
ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#version 2 libs
|
#version 2 libs
|
||||||
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||||
ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
|
||||||
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||||
$(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
#extreme old version compat
|
#extreme old version compat
|
||||||
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
||||||
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
|
||||||
ggml_v1_failsafe.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
ggml_v1_failsafe.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
||||||
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
|
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#opencl
|
#opencl
|
||||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||||
|
|
|
@ -683,7 +683,14 @@ void PurgeMissingTokens(llama_context * ctx, std::vector<int> &current_context_t
|
||||||
|
|
||||||
static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
|
static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
|
||||||
{
|
{
|
||||||
if(desiredBlasBatchSize<=0)
|
//check if approved to use BLAS
|
||||||
|
bool approved_format = !(file_format == FileFormat::BADFORMAT ||
|
||||||
|
file_format == FileFormat::GPT2_1 ||
|
||||||
|
file_format == FileFormat::GPTJ_1 ||
|
||||||
|
file_format == FileFormat::GPTJ_2 ||
|
||||||
|
file_format == FileFormat::RWKV_1 ||
|
||||||
|
file_format==FileFormat::RWKV_2);
|
||||||
|
if(!approved_format || desiredBlasBatchSize<=0)
|
||||||
{
|
{
|
||||||
desiredBlasBatchSize = 16;
|
desiredBlasBatchSize = 16;
|
||||||
}
|
}
|
||||||
|
@ -1684,14 +1691,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32);
|
||||||
bool approved_format = !(file_format == FileFormat::BADFORMAT ||
|
|
||||||
file_format == FileFormat::GPT2_1 ||
|
|
||||||
file_format == FileFormat::GPTJ_1 ||
|
|
||||||
file_format == FileFormat::GPTJ_2 ||
|
|
||||||
file_format == FileFormat::RWKV_1 ||
|
|
||||||
file_format==FileFormat::RWKV_2);
|
|
||||||
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32);
|
|
||||||
|
|
||||||
current_context_tokens.resize(n_past);
|
current_context_tokens.resize(n_past);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue