From b4dc29f42579263f4623efeccc4304ca20b6f62d Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 4 Jan 2025 00:49:04 +0800
Subject: [PATCH] kobo cheats death again (+1 squashed commits)

Squashed commits:

[708e2429] kobo cheats death again
---
 Makefile                   |   2 +-
 include/llama.h            |   2 -
 otherarch/sdcpp/model.cpp  |  13 +++
 otherarch/sdcpp/util.cpp   |  13 ---
 otherarch/sdcpp/util.h     |   2 -
 src/llama-mmap.cpp         |   4 +
 src/llama-model-loader.cpp |  33 +++++++
 src/llama-model.cpp        |  28 +++++-
 src/llama-quant.cpp        |   2 +-
 src/llama.cpp              | 172 ++++++++++++++++++++++++++++++++-----
 10 files changed, 225 insertions(+), 46 deletions(-)

diff --git a/Makefile b/Makefile
index 08881d75a..44e148523 100644
--- a/Makefile
+++ b/Makefile
@@ -626,7 +626,7 @@ whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@

 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-vocab.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)
diff --git a/include/llama.h b/include/llama.h
index 0bbf7fdc1..7b305b299 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -631,8 +631,6 @@ extern "C" {
             llama_pos   p0,
             llama_pos   p1);

-    LLAMA_API void printcache(struct llama_context * ctx);
-
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp
index fc6a45554..175931fee 100644
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -28,6 +28,19 @@

 #define ST_HEADER_SIZE_LEN 8

+static std::string format(const char* fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 uint64_t read_u64(uint8_t* buffer) {
     // little endian
     uint64_t value = 0;
diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp
index f1057f01c..d5cd8e229 100644
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@@ -62,19 +62,6 @@ void replace_all_chars(std::string& str, char target, char replacement) {
     }
 }

-std::string format(const char* fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 #ifdef _WIN32  // code for windows
 #include <windows.h>
diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h
index fafce7ba8..607ea1bb1 100644
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@@ -11,8 +11,6 @@
 bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);
-std::string format(const char* fmt, ...);
-
 void replace_all_chars(std::string& str, char target, char replacement);

 bool file_exists(const std::string& filename);
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index a99326335..0487d0836 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -374,6 +374,7 @@ struct llama_mmap::impl {
             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

+#ifndef USE_FAILSAFE
         if (prefetch > 0) {
 #if _WIN32_WINNT >= 0x602
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
@@ -394,6 +395,9 @@ struct llama_mmap::impl {
             throw std::runtime_error("PrefetchVirtualMemory unavailable");
 #endif
         }
+#else
+printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n");
+#endif
     }

     void unmap_fragment(size_t first, size_t last) {
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b4652..776993acb 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -7,6 +7,10 @@
 #include
 #include

+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -479,6 +483,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,

     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
+    if(false) //disable this log for now
     {
         std::map<enum ggml_type, uint32_t> n_type;

@@ -776,6 +781,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }

+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 bool llama_model_loader::load_all_data(
         struct ggml_context * ctx,
         llama_buf_map & bufs,
@@ -960,6 +983,16 @@ bool llama_model_loader::load_all_data(
                 }
             }

+            #if defined(GGML_USE_CLBLAST)
+            int layernum = layer_name_to_number(cur->name);
+            bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+            if(shouldoffload)
+            {
+                cur->backend = GGML_BACKEND_TYPE_GPU;
+                ggml_cl_transform_tensor(cur->data, cur);
+            }
+            #endif
+
             size_done += n_size;
         }

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ace0ba262..af70705f6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -11,6 +11,10 @@
 #include
 #include

+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -150,6 +154,9 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
         throw std::runtime_error(format("failed to create ggml context"));
     }

+    #if defined(GGML_USE_CLBLAST)
+    ggml_cl_init();
+    #endif
     ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
     ggml_tensor * op_tensor = fn(ctx.get());
     for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1153,6 +1160,16 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+            if (!OldBPETokenizerMode)
+            {
+                auto validcodepoints = unicode_cpts_from_utf8(word).size() > 0;
+                GGML_ASSERT_CONTINUE(validcodepoints);
+                if(!validcodepoints)
+                {
+                    OldBPETokenizerMode = true;
+                    printf("\nFalling Back to older tokenizer...");
+                }
+            }
             GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

             std::string first;
@@ -1398,10 +1415,13 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {

     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        if (word.empty()) {
+        if (!OldBPETokenizerMode)
+        {
+        if (word.empty()) {
             LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
             word = "[EMPTY_" + std::to_string(i) + "]";
         }
+        }

         vocab.token_to_id[word] = i;
         vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -1424,7 +1444,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
             }
         }
     }
-    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
+    GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());

     vocab.init_tokenizer();

@@ -1681,8 +1701,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
             } else {
                 // token is control, but not marked as EOG -> print a debug log
                 if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                            __func__, t.second, t.first.c_str());
+                    // LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                    //         __func__, t.second, t.first.c_str());
                 }
             }
         }
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f1..0526f1d1c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -394,7 +394,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_IQ1_M:
             case GGML_TYPE_Q2_K:
             case GGML_TYPE_Q3_K:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_Q4_0; break;
             case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
             case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
             case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
diff --git a/src/llama.cpp b/src/llama.cpp
index d7110b90b..959d97f95 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1,14 +1,19 @@
-#include "llama-impl.h"
-
-#include "llama-chat.h"
-#include "llama-mmap.h"
-#include "llama-context.h"
-#include "llama-vocab.h"
-#include "llama-sampling.h"
-#include "llama-kv-cache.h"
-#include "llama-model-loader.h"
-#include "llama-model.h"
-#include "llama-quant.h"
+// we do what we must because we can
+#include "llama-impl.cpp"
+#include "llama-chat.cpp"
+#include "llama-mmap.cpp"
+#include "llama-context.cpp"
+#include "llama-adapter.cpp"
+#include "llama-arch.cpp"
+#include "llama-batch.cpp"
+#include "llama-vocab.cpp"
+#include "llama-grammar.cpp"
+#include "llama-sampling.cpp"
+#include "llama-kv-cache.cpp"
+#include "llama-model-loader.cpp"
+#include "llama-model.cpp"
+#include "llama-quant.cpp"
+#include "llama-hparams.cpp"

 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -35,6 +40,14 @@
 #include
 #include
 #include
+#include <iostream>
+
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+static bool old_mixtral_warning_showed = false;

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -327,7 +340,13 @@ static bool llm_load_tensors(
     }

     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+
+    #if defined(GGML_USE_CLBLAST)
+    printf("\nOpenCL GPU Offload Fallback...\n");
+    clblast_offload_fallback_layers = n_gpu_layers;
+    i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
+    #endif
     const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
@@ -378,6 +397,9 @@ static bool llm_load_tensors(

     // create tensors for the weights
     {
+        //this is a very dirty kcpp hack that attempts to reuse the most recently used ctx for old mixtral models
+        ggml_context * last_used_ctx = nullptr;
+
         // note: cast to int64_t since we will use these for the tensor dimensions
         const int64_t n_head        = hparams.n_head();
         const int64_t n_head_kv     = hparams.n_head_kv();
@@ -400,6 +422,7 @@ static bool llm_load_tensors(
         }

         int n_moved_tensors = 0;
+        int n_total_tensors = 0;
         ggml_tensor * first_moved_tensor = nullptr;
         ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
         ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
@@ -485,6 +508,7 @@ static bool llm_load_tensors(
                     first_moved_to_buft   = buft;
                 }
             }
+            n_total_tensors++;

             ggml_context * ctx = ctx_for_buft(buft);

@@ -495,6 +519,7 @@ static bool llm_load_tensors(
                     return t;
                 }
             }
+            last_used_ctx = ctx; //this caches the last ctx which should match the buft we want for this layer. kobo forgive me.

             return ml.create_tensor(ctx, tn, ne, flags);
         };
@@ -558,8 +583,47 @@ static bool llm_load_tensors(
                         } else {
                             layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                             layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+                            if (layer.ffn_gate_exps) {
+                                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+                            } else {
+                                // merge split expert into a single tensor for compatibility with older MIXTRAL models
+                                // requires disabling mmap
+                                //slaren removed this in #10026, but i think its useful to keep.
+                                use_mmap_buffer = false;
+                                ml.use_mmap = false;
+                                if(!old_mixtral_warning_showed)
+                                {
+                                    std::cout << "\n!!!!!!\nWARNING: Using extremely outdated MoE quant. Please update it!\nAttempting to apply hacky kcpp fallback, using last ctx:" << last_used_ctx << "\n";
+                                    old_mixtral_warning_showed = true;
+                                }
+                                ggml_context * ctx_split = last_used_ctx;
+                                // for(auto it = ctx_map.cbegin(); it != ctx_map.cend(); ++it)
+                                // {
+                                //     std::cout << "\nName: " << ggml_backend_buft_name(it->first) << " Addr: " << it->second << "\n";
+                                //     ctx_split = it->second;
+                                // }
+
+                                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).str().c_str())->type;
+                                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).str().c_str())->type;
+                                ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).str().c_str())->type;
+
+                                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
+                                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
+                                layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
+
+                                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).str().c_str());
+                                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).str().c_str());
+                                ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).str().c_str());
+
+                                for (uint32_t x = 0; x < n_expert; ++x) {
+                                    // the individual experts are loaded into a view of the merged tensor
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                                }
+                            }
                         }
                     }
                 } break;
@@ -701,8 +765,47 @@ static bool llm_load_tensors(
                     layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);

                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+                    if (layer.ffn_gate_exps) {
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+                    } else {
+                        // merge split expert into a single tensor for compatibility with older MIXTRAL models
+                        // requires disabling mmap
+                        //slaren removed this in #10026, but i think its useful to keep.
+                        use_mmap_buffer = false;
+                        ml.use_mmap = false;
+                        if(!old_mixtral_warning_showed)
+                        {
+                            std::cout << "\n!!!!!!\nWARNING: Using extremely outdated MoE quant. Please update it!\nAttempting to apply hacky kcpp fallback, using last ctx:" << last_used_ctx << "\n";
+                            old_mixtral_warning_showed = true;
+                        }
+                        ggml_context * ctx_split = last_used_ctx;
+                        // for(auto it = ctx_map.cbegin(); it != ctx_map.cend(); ++it)
+                        // {
+                        //     std::cout << "\nName: " << ggml_backend_buft_name(it->first) << " Addr: " << it->second << "\n";
+                        //     ctx_split = it->second;
+                        // }
+
+                        ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).str().c_str())->type;
+                        ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).str().c_str())->type;
+                        ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).str().c_str())->type;
+
+                        layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
+                        layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
+                        layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
+
+                        ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).str().c_str());
+                        ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).str().c_str());
+                        ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).str().c_str());
+
+                        for (uint32_t x = 0; x < n_expert; ++x) {
+                            // the individual experts are loaded into a view of the merged tensor
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                        }
+                    }

                     layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                 }
@@ -2291,11 +2394,12 @@
                 throw std::runtime_error("unknown architecture");
         }

-        if (n_moved_tensors > 0) {
-            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
-                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
-                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
-        }
+        // if (n_moved_tensors > 0) {
+        //     LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+        //         __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+        //         ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+        // }
+        LLAMA_LOG_DEBUG("%s: relocated tensors: %d of %d\n", __func__, n_moved_tensors, n_total_tensors);
     }

     ml.done_getting_tensors();
@@ -2959,16 +3063,30 @@ static struct ggml_tensor * llm_build_kqv(
             cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                       hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);

+#if defined(GGML_USE_HIP) //workaround for speed regression on rocm
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2 || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        }
+#else
         ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+#endif

         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

+#if defined(GGML_USE_HIP) //workaround for speed regression on rocm
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        }
+#else
         // note: this op tends to require high floating point range
         // while for some models F16 is enough, for others it is not, so we default to F32 here
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+#endif

         if (model.arch == LLM_ARCH_GROK) {
             // need to do the following:
@@ -3865,7 +3983,11 @@ struct llm_build_context {
         // self-attention
         {
             // rope freq factors for llama3; may return nullptr for llama2 and other models
+            #if defined(GGML_USE_CLBLAST)
+            struct ggml_tensor * rope_factors = nullptr; //clblast does not work with rope_factors
+            #else
             struct ggml_tensor * rope_factors = build_rope_factors(il);
+            #endif

             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
@@ -10629,7 +10751,7 @@ static int llama_decode_internal(
             lctx.n_outputs = n_outputs_new;
         }

-        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
         ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;

         GGML_ASSERT(n_threads > 0);
@@ -10912,7 +11034,7 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;

-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
     ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;

     GGML_ASSERT(n_threads > 0);
@@ -11416,8 +11538,12 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
+    #if defined(GGML_USE_CLBLAST)
+    return true;
+    #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
            llama_supports_rpc();
+    #endif
 }

 bool llama_supports_rpc(void) {