diff --git a/Makefile b/Makefile index ff7c533ba..9a731e36c 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ endif CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = +FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS)) # these are used on windows, to build some libraries with extra old device compatibility SIMPLECFLAGS = @@ -380,23 +381,23 @@ $(info ) # ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@ ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) 
$(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@ #quants ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h @@ -415,41 +416,41 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h #version 3 libs ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ #version 2 libs ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ 
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ #extreme old version compat ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v1_failsafe.o: otherarch/ggml_v1.c otherarch/ggml_v1.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ #opencl ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 95b72d69d..02c7d56b0 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -683,7 +683,14 @@ void PurgeMissingTokens(llama_context * ctx, std::vector<int> &current_context_t static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format) { - if(desiredBlasBatchSize<=0) + //check if approved to use BLAS + bool approved_format = !(file_format == FileFormat::BADFORMAT || + file_format == FileFormat::GPT2_1 || + file_format == FileFormat::GPTJ_1 || + file_format == FileFormat::GPTJ_2 || + file_format == FileFormat::RWKV_1 || + file_format==FileFormat::RWKV_2); + if(!approved_format || desiredBlasBatchSize<=0) { desiredBlasBatchSize = 16; } @@ -1684,14 +1691,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } } - //if using BLAS and prompt is big enough, switch to single 
thread and use a huge batch - bool approved_format = !(file_format == FileFormat::BADFORMAT || - file_format == FileFormat::GPT2_1 || - file_format == FileFormat::GPTJ_1 || - file_format == FileFormat::GPTJ_2 || - file_format == FileFormat::RWKV_1 || - file_format==FileFormat::RWKV_2); - bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32); + bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32); current_context_tokens.resize(n_past);