NOTE(review): summary of this patch, as far as the text below shows:
  * Makefile: adds an llavaclip_vulkan.o rule that compiles examples/llava/clip.cpp
    with $(VULKAN_FLAGS), and makes koboldcpp_vulkan and koboldcpp_vulkan_noavx2
    depend on it instead of llavaclip_default.o.
  * gpttype_adapter.cpp: declares time0/time1/time2 and calls timer_start() in
    gpttype_generate, and adds an "Init:%.2fs" field (time0) to the timing printf.
NOTE(review): this copy was recovered from text whose line breaks and indentation were
    lost. The TAB on Makefile recipe lines and the 4-space C++ indentation are
    reconstructed; the blank context lines were placed so the hunk line counts match
    their headers. Confirm against the upstream tree before applying.
NOTE(review): the second gpttype_adapter.cpp hunk is incomplete. Everything between
    "for(int x=0;x" and "n_predict, time1, ..." (the rest of the for statement, a hunk
    header, and the start of the removed printf line) is missing from the recovered
    text and is left as found, so that hunk will not apply as-is.

diff --git a/Makefile b/Makefile
index a413190c6..646597100 100644
--- a/Makefile
+++ b/Makefile
@@ -470,6 +470,8 @@ llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
+llavaclip_vulkan.o: examples/llava/clip.cpp examples/llava/clip.h
+	$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
 
 #this is only used for openblas and accelerate
 ggml-blas.o: ggml/src/ggml-blas.cpp ggml/include/ggml-blas.h
@@ -663,10 +665,10 @@ koboldcpp_hipblas:
 endif
 
 ifdef VULKAN_BUILD
-koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
+koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
 	$(VULKAN_BUILD)
 ifdef NOAVX2_BUILD
-koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
 	$(VULKAN_BUILD)
 else
 koboldcpp_vulkan_noavx2:
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 15f80dba3..4ad320826 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -1961,6 +1961,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     dry_sequence_breakers.clear();
     dry_max_token_repeat.clear();
 
+    double time0 = 0, time1 = 0, time2 = 0;
+    timer_start();
+
     for(int x=0;xn_predict, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), tokens_per_second);
+    printf("\nCtxLimit:%d/%d, Amt:%d/%d, Init:%.2fs, Process:%.2fs (%.1fms/T = %.2fT/s), Generate:%.2fs (%.1fms/T = %.2fT/s), Total:%.2fs (%.2fT/s)",(int)current_context_tokens.size(),(int)nctx, realnpredict, kcpp_params->n_predict, time0, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), tokens_per_second);
     fflush(stdout);
     output.status = 1;
     output.stopreason = last_stop_reason;