From f273fd35b9274a89d5a89f377e8ad95e30516b5c Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 16 May 2026 00:23:10 -0300 Subject: [PATCH] sd: sync to master-601-eeac950 (#2206) * sd: sync to master-601-eeac950 * sd: add mmap support --- expose.h | 1 + koboldcpp.py | 2 + otherarch/sdcpp/common/common.cpp | 4 +- otherarch/sdcpp/denoiser.hpp | 108 +++++++++++---- otherarch/sdcpp/ggml_extend.hpp | 19 ++- otherarch/sdcpp/model.cpp | 195 +++++++++++++++++++++++---- otherarch/sdcpp/model.h | 21 +++ otherarch/sdcpp/sdtype_adapter.cpp | 11 ++ otherarch/sdcpp/stable-diffusion.cpp | 148 ++++++++++++++++---- otherarch/sdcpp/stable-diffusion.h | 2 + otherarch/sdcpp/util.cpp | 108 ++++++++++++--- otherarch/sdcpp/util.h | 3 +- 12 files changed, 514 insertions(+), 108 deletions(-) diff --git a/expose.h b/expose.h index e4ee7ff6c..695771a9f 100644 --- a/expose.h +++ b/expose.h @@ -182,6 +182,7 @@ struct sd_load_model_inputs const int quant = 0; const bool flash_attention = false; const bool offload_cpu = false; + const bool use_mmap = false; const bool vae_cpu = false; const bool clip_cpu = false; const bool diffusion_conv_direct = false; diff --git a/koboldcpp.py b/koboldcpp.py index d39cba4df..1694289dd 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -363,6 +363,7 @@ class sd_load_model_inputs(ctypes.Structure): ("quant", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("offload_cpu", ctypes.c_bool), + ("use_mmap", ctypes.c_bool), ("vae_cpu", ctypes.c_bool), ("clip_cpu", ctypes.c_bool), ("diffusion_conv_direct", ctypes.c_bool), @@ -2401,6 +2402,7 @@ def sd_load_model(model_filename,vae_filename,t5xxl_filename,clip1_filename,clip inputs.quant = args.sdquant inputs.flash_attention = args.sdflashattention inputs.offload_cpu = args.sdoffloadcpu + inputs.use_mmap = args.usemmap inputs.vae_cpu = args.sdvaecpu inputs.clip_cpu = False if args.sdclipgpu else True sdconvdirect = sd_convdirect_option(args.sdconvdirect) diff --git a/otherarch/sdcpp/common/common.cpp 
b/otherarch/sdcpp/common/common.cpp index d4c8a72b8..8ca7a2dcb 100644 --- a/otherarch/sdcpp/common/common.cpp +++ b/otherarch/sdcpp/common/common.cpp @@ -1244,12 +1244,12 @@ ArgOptions SDGenerationParams::get_options() { on_seed_arg}, {"", "--sampling-method", - "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] " + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] " "(default: euler for Flux/SD3/Wan, euler_a otherwise)", on_sample_method_arg}, {"", "--high-noise-sampling-method", - "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde]" + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp]" " default: euler for Flux/SD3/Wan, euler_a otherwise", on_high_noise_sample_method_arg}, {"", diff --git a/otherarch/sdcpp/denoiser.hpp b/otherarch/sdcpp/denoiser.hpp index 831da2580..3742f53ba 100644 --- a/otherarch/sdcpp/denoiser.hpp +++ b/otherarch/sdcpp/denoiser.hpp @@ -752,7 +752,7 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser { } }; -typedef std::function(const sd::Tensor&, float, int)> denoise_cb_t; +typedef std::function(const sd::Tensor&, float, int, sd::Tensor*)> denoise_cb_t; static std::pair get_ancestral_step(float sigma_from, float sigma_to, @@ -828,7 +828,7 @@ static sd::Tensor sample_euler_ancestral(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; - auto denoised_opt = model(x, sigma, i + 1); + auto denoised_opt = model(x, sigma, i + 1, nullptr); if (denoised_opt.empty()) { return {}; } 
@@ -851,7 +851,7 @@ static sd::Tensor sample_euler_flow(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; - auto denoised_opt = model(x, sigma, i + 1); + auto denoised_opt = model(x, sigma, i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -873,7 +873,7 @@ static sd::Tensor sample_euler(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; - auto denoised_opt = model(x, sigma, i + 1); + auto denoised_opt = model(x, sigma, i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -889,7 +889,7 @@ static sd::Tensor sample_heun(denoise_cb_t model, const std::vector& sigmas) { int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], -(i + 1)); + auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr); if (denoised_opt.empty()) { return {}; } @@ -900,7 +900,7 @@ static sd::Tensor sample_heun(denoise_cb_t model, x += d * dt; } else { sd::Tensor x2 = x + d * dt; - auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + auto denoised2_opt = model(x2, sigmas[i + 1], i + 1, nullptr); if (denoised2_opt.empty()) { return {}; } @@ -917,7 +917,7 @@ static sd::Tensor sample_dpm2(denoise_cb_t model, const std::vector& sigmas) { int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], -(i + 1)); + auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr); if (denoised_opt.empty()) { return {}; } @@ -930,7 +930,7 @@ static sd::Tensor sample_dpm2(denoise_cb_t model, float dt_1 = sigma_mid - sigmas[i]; float dt_2 = sigmas[i + 1] - sigmas[i]; sd::Tensor x2 = x + d * dt_1; - auto denoised2_opt = model(x2, sigma_mid, i + 1); + auto denoised2_opt = model(x2, sigma_mid, i + 1, nullptr); if (denoised2_opt.empty()) { return {}; } @@ -951,7 +951,7 @@ static sd::Tensor sample_dpmpp_2s_ancestral(denoise_cb_t 
model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], -(i + 1)); + auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr); if (denoised_opt.empty()) { return {}; } @@ -967,7 +967,7 @@ static sd::Tensor sample_dpmpp_2s_ancestral(denoise_cb_t model, float s = t + 0.5f * h; float sigma_s = sigma_fn(s); sd::Tensor x2 = (sigma_s / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised; - auto denoised2_opt = model(x2, sigma_s, i + 1); + auto denoised2_opt = model(x2, sigma_s, i + 1, nullptr); if (denoised2_opt.empty()) { return {}; } @@ -994,7 +994,7 @@ static sd::Tensor sample_dpmpp_2s_ancestral_flow(denoise_cb_t model, bool opt_first_step = (1.0 - sigma < 1e-6); - auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1)); + auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1), nullptr); if (denoised_opt.empty()) { return {}; } @@ -1023,8 +1023,8 @@ static sd::Tensor sample_dpmpp_2s_ancestral_flow(denoise_cb_t model, // so sigma_s = 1 = sigma, and sigma_s_i_ratio = sigma_s / sigma = 1 // u = (x*sigma_s_i_ratio)+(denoised*(1.0f-sigma_s_i_ratio)) // = (x*1)+(denoised*0) = x - // so D_i = model(u, sigma_s, i + 1) - // = model(x, sigma, i + 1) + // so D_i = model(u, sigma_s, i + 1, nullptr) + // = model(x, sigma, i + 1, nullptr) // = denoised D_i = denoised; @@ -1057,7 +1057,7 @@ static sd::Tensor sample_dpmpp_2s_ancestral_flow(denoise_cb_t model, float sigma_s_i_ratio = sigma_s / sigma; sd::Tensor u = (x * sigma_s_i_ratio) + (denoised * (1.0f - sigma_s_i_ratio)); - auto denoised2_opt = model(u, sigma_s, i + 1); + auto denoised2_opt = model(u, sigma_s, i + 1, nullptr); if (denoised2_opt.empty()) { return {}; } @@ -1084,7 +1084,7 @@ static sd::Tensor sample_dpmpp_2m(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], i + 1); + auto denoised_opt = model(x, sigmas[i], i + 1, nullptr); 
if (denoised_opt.empty()) { return {}; } @@ -1116,7 +1116,7 @@ static sd::Tensor sample_dpmpp_2m_v2(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], i + 1); + auto denoised_opt = model(x, sigmas[i], i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -1151,7 +1151,7 @@ static sd::Tensor sample_lcm(denoise_cb_t model, bool is_flow_denoiser) { int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], i + 1); + auto denoised_opt = model(x, sigmas[i], i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -1177,7 +1177,7 @@ static sd::Tensor sample_ipndm(denoise_cb_t model, float sigma = sigmas[i]; float sigma_next = sigmas[i + 1]; - auto denoised_opt = model(x, sigma, i + 1); + auto denoised_opt = model(x, sigma, i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -1221,7 +1221,7 @@ static sd::Tensor sample_ipndm_v(denoise_cb_t model, float sigma = sigmas[i]; float t_next = sigmas[i + 1]; - auto denoised_opt = model(x, sigma, i + 1); + auto denoised_opt = model(x, sigma, i + 1, nullptr); if (denoised_opt.empty()) { return {}; } @@ -1283,14 +1283,14 @@ static sd::Tensor sample_res_multistep(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - auto denoised_opt = model(x, sigmas[i], i + 1); + auto denoised_opt = model(x, sigmas[i], i + 1, nullptr); if (denoised_opt.empty()) { return {}; } sd::Tensor denoised = std::move(denoised_opt); - float sigma_from = sigmas[i]; - float sigma_to = sigmas[i + 1]; + float sigma_from = sigmas[i]; + float sigma_to = sigmas[i + 1]; auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser); @@ -1360,7 +1360,7 @@ static sd::Tensor sample_res_2s(denoise_cb_t model, float sigma_from = sigmas[i]; float sigma_to = sigmas[i + 1]; - auto denoised_opt = model(x, sigma_from, -(i 
+ 1)); + auto denoised_opt = model(x, sigma_from, -(i + 1), nullptr); if (denoised_opt.empty()) { return {}; } @@ -1386,7 +1386,7 @@ static sd::Tensor sample_res_2s(denoise_cb_t model, sd::Tensor eps1 = denoised - x0; sd::Tensor x2 = x0 + eps1 * (h * a21); - auto denoised2_opt = model(x2, sigma_c2, i + 1); + auto denoised2_opt = model(x2, sigma_c2, i + 1, nullptr); if (denoised2_opt.empty()) { return {}; } @@ -1463,7 +1463,7 @@ static sd::Tensor sample_er_sde(denoise_cb_t model, int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - sd::Tensor denoised = model(x, sigmas[i], i + 1); + sd::Tensor denoised = model(x, sigmas[i], i + 1, nullptr); if (denoised.empty()) { return {}; } @@ -1549,7 +1549,7 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, float sigma = sigmas[i]; float sigma_to = sigmas[i + 1]; - auto model_output_opt = model(x, sigma, i + 1); + auto model_output_opt = model(x, sigma, i + 1, nullptr); if (model_output_opt.empty()) { return {}; } @@ -1621,7 +1621,7 @@ static sd::Tensor sample_tcd(denoise_cb_t model, int timestep_s = (int)floor((1 - eta) * prev_timestep); float sigma = sigmas[i]; - auto model_output_opt = model(x, sigma, i + 1); + auto model_output_opt = model(x, sigma, i + 1, nullptr); if (model_output_opt.empty()) { return {}; } @@ -1649,6 +1649,56 @@ static sd::Tensor sample_tcd(denoise_cb_t model, return x; } +static sd::Tensor sample_euler_cfg_pp(denoise_cb_t model, + sd::Tensor x, + const std::vector& sigmas) { + int steps = static_cast(sigmas.size()) - 1; + for (int i = 0; i < steps; i++) { + float sigma = sigmas[i]; + sd::Tensor uncond_denoised; + + auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised); + if (denoised_opt.empty() || uncond_denoised.empty()) { + return {}; + } + + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - uncond_denoised) / sigma; + + x = denoised + d * sigmas[i + 1]; + } + return x; +} + +static sd::Tensor sample_euler_ancestral_cfg_pp(denoise_cb_t 
model, + sd::Tensor x, + const std::vector& sigmas, + std::shared_ptr rng, + float eta) { + int steps = static_cast(sigmas.size()) - 1; + for (int i = 0; i < steps; i++) { + float sigma = sigmas[i]; + sd::Tensor uncond_denoised; + + auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised); + if (denoised_opt.empty() || uncond_denoised.empty()) { + return {}; + } + + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - uncond_denoised) / sigma; + + auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta); + + x = denoised + d * sigma_down; + + if (sigmas[i + 1] > 0) { + x += sd::Tensor::randn_like(x, rng) * sigma_up; + } + } + return x; +} + // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t static sd::Tensor sample_k_diffusion(sample_method_t method, denoise_cb_t model, @@ -1694,6 +1744,10 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, return sample_ddim_trailing(model, std::move(x), sigmas, rng, eta); case TCD_SAMPLE_METHOD: return sample_tcd(model, std::move(x), sigmas, rng, eta); + case EULER_CFG_PP_SAMPLE_METHOD: + return sample_euler_cfg_pp(model, std::move(x), sigmas); + case EULER_A_CFG_PP_SAMPLE_METHOD: + return sample_euler_ancestral_cfg_pp(model, std::move(x), sigmas, rng, eta); default: return {}; } diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index 32d17916e..3ef49b239 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -2567,7 +2567,24 @@ public: bool alloc_params_buffer() { size_t num_tensors = ggml_tensor_num(params_ctx); - params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); + if (num_tensors > 0) { + // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated + // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch. 
+ bool all_have_data = true; + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + if (t->data == nullptr) { + all_have_data = false; + break; + } + } + if (all_have_data) { + LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str()); + params_buffer = nullptr; + rebuild_params_tensor_set(); + return true; + } + } + params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); if (params_buffer == nullptr) { LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", get_desc().c_str(), diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp index 4c05f756c..af9fcc3bd 100644 --- a/otherarch/sdcpp/model.cpp +++ b/otherarch/sdcpp/model.cpp @@ -758,16 +758,10 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru } } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) { - int64_t process_time_ms = 0; - std::atomic read_time_ms(0); - std::atomic memcpy_time_ms(0); - std::atomic copy_to_backend_time_ms(0); - std::atomic convert_time_ms(0); - std::atomic bytes_processed(0); - - int num_threads_to_use = n_threads_p > 0 ? 
n_threads_p : sd_get_num_physical_cores(); - LOG_DEBUG("using %d threads for model loading", num_threads_to_use); +void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) { + if (model_files_processed) { + return; + } int64_t start_time = ggml_time_ms(); @@ -779,22 +773,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread processed_tensor_storages.push_back(tensor_storage); } - process_time_ms = ggml_time_ms() - start_time; - - bool success = true; - size_t total_tensors_processed = 0; - const size_t total_tensors_to_process = processed_tensor_storages.size(); - const int64_t t_start = ggml_time_ms(); - int last_n_threads = 1; - for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; - LOG_DEBUG("loading tensors from %s", file_path.c_str()); - std::vector file_tensors; + std::vector file_tensors; for (const auto& ts : processed_tensor_storages) { if (ts.file_index == file_index) { - file_tensors.push_back(&ts); + file_tensors.push_back(ts); } } if (file_tensors.empty()) { @@ -803,21 +788,169 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread bool is_zip = false; for (auto const& ts : file_tensors) { - if (ts->index_in_zip >= 0) { + if (ts.index_in_zip >= 0) { is_zip = true; break; } } - std::unique_ptr mmapped; + ModelFileData fdata = {}; + fdata.path = file_path; + fdata.is_zip = is_zip; + fdata.tensors = std::move(file_tensors); + if (enable_mmap && !is_zip) { LOG_DEBUG("using mmap for I/O"); - mmapped = MmapWrapper::create(file_path); - if (!mmapped) { - LOG_WARN("failed to memory-map '%s'", file_path.c_str()); + std::unique_ptr mmapped = MmapWrapper::create(file_path, writable_mmap); + if (mmapped) { + uint8_t* mmap_data = static_cast(mmapped->writable_data()); + ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size()); + if (buf_mmap) { + LOG_INFO("using mmap for '%s'", 
file_path.c_str()); + fdata.mmbuffer = std::shared_ptr(buf_mmap, ggml_backend_buffer_free); + } else { + LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str()); + } + fdata.mmapped = std::shared_ptr(std::move(mmapped)); + } else { + LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str()); } + } else if (!is_zip) { + LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", + file_path.c_str()); } + file_data.push_back(std::move(fdata)); + } + + model_files_processed = true; + + int64_t end_time = ggml_time_ms(); + int64_t process_time_ms = end_time - start_time; + + LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f); +} + +std::vector ModelLoader::mmap_tensors(std::map& tensors, + std::set ignore_tensors, + bool writable_mmap) { + process_model_files(true, writable_mmap); + + std::vector result; + uint64_t mapped_bytes = 0; + size_t mapped_tensors = 0; + + LOG_DEBUG("memory-mapping tensors..."); + + int64_t t_start = ggml_time_ms(); + + for (auto& fdata : file_data) { + if (!fdata.mmbuffer) + continue; + + const std::vector& file_tensors = fdata.tensors; + + size_t file_mapped_bytes = 0; + size_t file_mapped_tensors = 0; + + for (const auto& tensor_storage : file_tensors) { + const std::string& name = tensor_storage.name; + + bool is_ignored = false; + for (const auto& ignore_prefix : ignore_tensors) { + if (starts_with(name, ignore_prefix)) { + is_ignored = true; + break; + } + } + if (is_ignored) + continue; + + auto it = tensors.find(name); + if (it == tensors.end()) + continue; + + ggml_tensor* dst_tensor = it->second; + if (dst_tensor == nullptr) + continue; + + if (tensor_storage.type != dst_tensor->type) + continue; + + size_t tensor_size = tensor_storage.nbytes(); + size_t tensor_offset = tensor_storage.offset; + + if (tensor_storage.ne[0] != dst_tensor->ne[0] || + tensor_storage.ne[1] != dst_tensor->ne[1] || + tensor_storage.ne[2] != dst_tensor->ne[2] || + 
tensor_storage.ne[3] != dst_tensor->ne[3] || + tensor_size != ggml_nbytes(dst_tensor)) { + // let load_tensors worry about this + continue; + } + + ggml_backend_buffer_t buf_mmap = fdata.mmbuffer.get(); + uint8_t* mmap_data = static_cast(ggml_backend_buffer_get_base(buf_mmap)); + dst_tensor->buffer = buf_mmap; + dst_tensor->data = mmap_data + tensor_offset; + + file_mapped_bytes += tensor_size; + file_mapped_tensors++; + } + + if (file_mapped_bytes > 0) { + mapped_tensors += file_mapped_tensors; + mapped_bytes += file_mapped_bytes; + result.push_back({fdata.mmapped, fdata.mmbuffer}); + } + } + + int64_t t_end = ggml_time_ms(); + int64_t duration_ms = t_end - t_start; + + LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs", + mapped_tensors, + result.size(), + mapped_bytes / (1024.0 * 1024.0), + duration_ms / 1000.0); + + return result; +} + +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) { + process_model_files(enable_mmap, false); + + std::atomic read_time_ms(0); + std::atomic memcpy_time_ms(0); + std::atomic copy_to_backend_time_ms(0); + std::atomic convert_time_ms(0); + std::atomic bytes_processed(0); + + int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores(); + LOG_DEBUG("using %d threads for model loading", num_threads_to_use); + + int64_t start_time = ggml_time_ms(); + + size_t total_tensors_to_process = 0; + for (const auto& fdata : file_data) { + total_tensors_to_process += fdata.tensors.size(); + } + + bool success = true; + size_t total_tensors_processed = 0; + const int64_t t_start = start_time; + int last_n_threads = 1; + + for (auto& fdata : file_data) { + const std::string& file_path = fdata.path; + LOG_DEBUG("loading tensors from %s", file_path.c_str()); + + const std::vector& file_tensors = fdata.tensors; + + bool is_zip = fdata.is_zip; + + std::shared_ptr mmapped = fdata.mmapped; + int n_threads = is_zip ? 
1 : std::min(num_threads_to_use, (int)file_tensors.size()); if (n_threads < 1) { n_threads = 1; @@ -858,7 +991,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread break; } - const TensorStorage& tensor_storage = *file_tensors[idx]; + const TensorStorage& tensor_storage = file_tensors[idx]; ggml_tensor* dst_tensor = nullptr; t0 = ggml_time_ms(); @@ -875,6 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread continue; } + // skip mmapped tensors + if (dst_tensor->buffer != nullptr && dst_tensor->buffer == fdata.mmbuffer.get()) { + continue; + } + size_t nbytes_to_read = tensor_storage.nbytes_to_read(); auto read_data = [&](char* buf, size_t n) { @@ -1018,9 +1156,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } int64_t end_time = ggml_time_ms(); - LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)", + LOG_INFO("loading tensors completed, taking %.2fs (read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)", (end_time - start_time) / 1000.f, - process_time_ms / 1000.f, (read_time_ms.load() / (float)last_n_threads) / 1000.f, (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f, (convert_time_ms.load() / (float)last_n_threads) / 1000.f, diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h index 2689f63bd..97509587b 100644 --- a/otherarch/sdcpp/model.h +++ b/otherarch/sdcpp/model.h @@ -193,10 +193,27 @@ using TensorTypeRules = std::vector>; TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules); +class MmapWrapper; + +struct ModelFileData { + std::string path; + std::vector tensors; + std::shared_ptr mmapped; + std::shared_ptr mmbuffer; + bool is_zip; +}; + +struct MmapTensorStore { + std::shared_ptr mmapped; + std::shared_ptr mmbuffer; +}; + class ModelLoader { protected: SDVersion version_ = VERSION_COUNT; std::vector 
file_paths_; + std::vector file_data; + bool model_files_processed = false; String2TensorStorage tensor_storage_map; void add_tensor_storage(const TensorStorage& tensor_storage); @@ -221,6 +238,10 @@ public: std::map get_vae_wtype_stat(); String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); + void process_model_files(bool enable_mmap = false, bool writable_mmap = true); + std::vector mmap_tensors(std::map& tensors, + std::set ignore_tensors = {}, + bool writable = true); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false); bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index f622a4354..eb7d6eb91 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -360,6 +360,16 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { { printf("Conv2D Direct for VAE model is enabled\n"); } + if (inputs.use_mmap && inputs.offload_cpu) { + printf("Offloading weights to system RAM with mmap\n"); + if (!lora_dynamic && inputs.lora_len > 0) { + printf("Note: static LoRAs can reduce mmap memory savings!\n"); + } + } else if (inputs.offload_cpu) { + printf("Offloading weights to system RAM\n"); + } else if (inputs.use_mmap) { + printf("Using mmap for I/O\n"); + } if(inputs.quant > 0) { printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n"); @@ -424,6 +434,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { params.vae_conv_direct = sd_params->vae_conv_direct; params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask; params.offload_params_to_cpu = inputs.offload_cpu; + params.enable_mmap = inputs.use_mmap; params.keep_vae_on_cpu = inputs.vae_cpu; params.keep_clip_on_cpu = inputs.clip_cpu; params.lora_apply_mode = 
(lora_apply_mode_t)lora_apply_mode; diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 9077f0618..cc44f18f4 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -75,6 +75,8 @@ const char* sampling_methods_str[] = { "Res Multistep", "Res 2s", "ER-SDE", + "Euler CFG++", + "Euler A CFG++", }; /*================================================== Helper Functions ================================================*/ @@ -110,6 +112,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: + std::vector mmap_tensor_store; ggml_backend_t backend = nullptr; // general backend ggml_backend_t clip_backend = nullptr; ggml_backend_t control_net_backend = nullptr; @@ -560,6 +563,51 @@ public: apply_lora_immediately = false; } + std::map mmap_able_tensors; + bool enable_mmap_tensors = false; + bool main_backend_mmap = false; + bool needs_writable_mmap = false; + if (sd_ctx_params->enable_mmap) { + if (apply_lora_immediately) { + needs_writable_mmap = true; + LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); + } + enable_mmap_tensors = true; + if (offload_params_to_cpu) { + main_backend_mmap = true; + } else { + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + struct ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + main_backend_mmap = props.caps.buffer_from_host_ptr; + } + } + + // split definition to avoid msvc choking on the extra parameter handling + auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) { + std::map temp; + model->get_param_tensors(temp, prefix); + bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); + for (const auto& [key, tensor] : temp) { + tensors[key] = tensor; + if (do_mmap) { + mmap_able_tensors[key] = tensor; + } + } + }; + + auto get_param_tensors = [&](auto&& model, bool force_cpu = false) { + 
std::map temp; + model->get_param_tensors(temp); + bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); + for (const auto& [key, tensor] : temp) { + tensors[key] = tensor; + if (do_mmap) { + mmap_able_tensors[key] = tensor; + } + } + }; + if (sd_version_is_control(version)) { // Might need vae encode for control cond vae_decode_only = false; @@ -671,8 +719,7 @@ public: offload_params_to_cpu, tensor_storage_map); clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes); - clip_vision->alloc_params_buffer(); - clip_vision->get_param_tensors(tensors); + get_param_tensors(clip_vision); } } else if (sd_version_is_qwen_image(version)) { bool enable_vision = false; @@ -748,12 +795,10 @@ public: } cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - cond_stage_model->alloc_params_buffer(); - cond_stage_model->get_param_tensors(tensors); + get_param_tensors(cond_stage_model, clip_on_cpu); diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + get_param_tensors(diffusion_model); if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -761,8 +806,7 @@ public: if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - high_noise_diffusion_model->alloc_params_buffer(); - high_noise_diffusion_model->get_param_tensors(tensors); + get_param_tensors(high_noise_diffusion_model); } if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { @@ -825,6 +869,8 @@ public: } }; + bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu; + if (version == VERSION_CHROMA_RADIANCE) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, @@ -834,20 +880,17 @@ public: LOG_INFO("using TAE for encoding / decoding"); first_stage_model = create_tae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - first_stage_model->alloc_params_buffer(); - 
first_stage_model->get_param_tensors(tensors, "tae"); + get_param_tensors_p(first_stage_model, force_vae_cpu, "tae"); } else { LOG_INFO("using VAE for encoding / decoding"); first_stage_model = create_vae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - first_stage_model->alloc_params_buffer(); - first_stage_model->get_param_tensors(tensors, "first_stage_model"); + get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model"); if (use_tae && tae_preview_only) { LOG_INFO("using TAE for preview"); preview_vae = create_tae(); preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes); - preview_vae->alloc_params_buffer(); - preview_vae->get_param_tensors(tensors, "tae"); + get_param_tensors_p(preview_vae, force_vae_cpu, "tae"); } } @@ -916,11 +959,7 @@ public: } } if (use_pmid) { - if (!pmid_model->alloc_params_buffer()) { - LOG_ERROR(" pmid model params buffer allocation failed"); - return false; - } - pmid_model->get_param_tensors(tensors, "pmid"); + get_param_tensors_p(pmid_model, false, "pmid"); } if (sd_ctx_params->flash_attn) { @@ -1003,6 +1042,41 @@ public: ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } + + if (enable_mmap_tensors) { + if (mmap_able_tensors.empty()) { + LOG_DEBUG("no tensors could be memory-mapped"); + } else { + mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors, needs_writable_mmap); + } + } + + if (clip_vision) { + clip_vision->alloc_params_buffer(); + } + if (cond_stage_model) { + cond_stage_model->alloc_params_buffer(); + } + if (diffusion_model) { + diffusion_model->alloc_params_buffer(); + } + if (high_noise_diffusion_model) { + high_noise_diffusion_model->alloc_params_buffer(); + } + if (first_stage_model) { + first_stage_model->alloc_params_buffer(); + } + if (preview_vae) { + preview_vae->alloc_params_buffer(); + } + if (use_pmid && pmid_model) { + if 
(!pmid_model->alloc_params_buffer()) { + LOG_ERROR(" pmid model params buffer allocation failed"); + ggml_free(ctx); + return false; + } + } + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); if (!success) { LOG_ERROR("load tensors from model loader failed"); @@ -1828,6 +1902,15 @@ public: cache_params, denoiser.get(), sigmas); + + // Spectrum cache is not supported for CFG++ samplers + if (method == EULER_CFG_PP_SAMPLE_METHOD || method == EULER_A_CFG_PP_SAMPLE_METHOD) { + if (cache_runtime.spectrum_enabled) { + LOG_WARN("Spectrum cache requested but not supported for CFG++ samplers"); + cache_runtime.spectrum_enabled = false; + } + } + size_t steps = sigmas.size() - 1; bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); if (has_skiplayer && !sd_version_is_dit(version)) { @@ -1842,7 +1925,7 @@ public: sd::Tensor denoised = x_t; SamplePreviewContext preview = prepare_sample_preview_context(); - auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::Tensor { + auto denoise = [&](const sd::Tensor& x, float sigma, int step, sd::Tensor* out_uncond_denoised = nullptr) -> sd::Tensor { if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } @@ -1865,15 +1948,17 @@ public: } if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) { - cache_runtime.spectrum.predict(&denoised); - if (!denoise_mask.empty()) { - denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); + if (out_uncond_denoised == nullptr) { + cache_runtime.spectrum.predict(&denoised); + if (!denoise_mask.empty()) { + denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); + } + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); + } + report_sample_progress(step, steps, t0); + return denoised; } - if (sd_should_preview_denoised() && preview.callback != 
nullptr) { - preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); - } - report_sample_progress(step, steps, t0); - return denoised; } if (sd_should_preview_noisy() && preview.callback != nullptr) { @@ -1996,6 +2081,10 @@ public: latent_result += (cond_out - skip_cond_out) * slg_scale; } denoised = latent_result * c_out + x * c_skip; + if (out_uncond_denoised != nullptr) { + sd::Tensor base_uncond = !uncond_out.empty() ? uncond_out : cond_out; + *out_uncond_denoised = base_uncond * c_out + x * c_skip; + } if (cache_runtime.spectrum_enabled) { cache_runtime.spectrum.update(denoised); } @@ -2210,6 +2299,8 @@ const char* sample_method_to_str[] = { "res_multistep", "res_2s", "er_sde", + "euler_cfg_pp", + "euler_a_cfg_pp", }; const char* sd_sample_method_name(enum sample_method_t sample_method) { @@ -2772,6 +2863,7 @@ static float resolve_eta(sd_ctx_t* sd_ctx, case EULER_A_SAMPLE_METHOD: case DPMPP2S_A_SAMPLE_METHOD: case ER_SDE_SAMPLE_METHOD: + case EULER_A_CFG_PP_SAMPLE_METHOD: return 1.0f; default:; } diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h index e4f1b307e..449224eb5 100644 --- a/otherarch/sdcpp/stable-diffusion.h +++ b/otherarch/sdcpp/stable-diffusion.h @@ -51,6 +51,8 @@ enum sample_method_t { RES_MULTISTEP_SAMPLE_METHOD, RES_2S_SAMPLE_METHOD, ER_SDE_SAMPLE_METHOD, + EULER_CFG_PP_SAMPLE_METHOD, + EULER_A_CFG_PP_SAMPLE_METHOD, SAMPLE_METHOD_COUNT }; diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp index 17a41043d..0f81152ed 100644 --- a/otherarch/sdcpp/util.cpp +++ b/otherarch/sdcpp/util.cpp @@ -121,7 +121,7 @@ private: HANDLE hmapping_; }; -std::unique_ptr MmapWrapper::create(const std::string& filename) { +std::unique_ptr MmapWrapper::create(const std::string& filename, bool writable) { void* mapped_data = nullptr; size_t file_size = 0; @@ -146,14 +146,18 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { file_size = static_cast(size.QuadPart); - 
HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr); + DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY; + + HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, page_prot, 0, 0, nullptr); if (mapping_handle == nullptr) { CloseHandle(file_handle); return nullptr; } - mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size); + DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ; + + mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size); if (mapped_data == nullptr) { CloseHandle(mapping_handle); @@ -186,28 +190,85 @@ std::string sd_get_u8path(const std::string& file_path) return std::filesystem::path(file_path).string(); } -class MmapWrapperImpl : public MmapWrapper { -public: - MmapWrapperImpl(void* data, size_t size) - : MmapWrapper(data, size) {} - - ~MmapWrapperImpl() override { - munmap(data_, size_); - } +struct MmapFlags { + bool sequential; + bool populate; + bool willneed; + bool dontneed; }; -std::unique_ptr MmapWrapper::create(const std::string& filename) { +static MmapFlags get_mmap_flags() { + MmapFlags result = {}; + const char* SD_MMAP_FLAGS = std::getenv("SD_MMAP_FLAGS"); + if (SD_MMAP_FLAGS && *SD_MMAP_FLAGS) { + std::stringstream ss(SD_MMAP_FLAGS); + std::string token; + while (std::getline(ss, token, ',')) { + std::string ntoken = trim(token); + std::transform(ntoken.begin(), ntoken.end(), ntoken.begin(), ::tolower); + if (ntoken == "sequential") { + result.sequential = true; + } else if (ntoken == "populate") { + result.populate = true; + } else if (ntoken == "willneed") { + result.willneed = true; + } else if (ntoken == "dontneed") { + result.dontneed = true; + } + } + } + return result; +} + +class MmapWrapperImpl : public MmapWrapper { +public: + MmapWrapperImpl(void* data, size_t size, int fd) + : MmapWrapper(data, size), fd_(fd) {} + + ~MmapWrapperImpl() override { +#ifdef __linux__ + auto cfg_flags = get_mmap_flags(); + + // Drop 
the kernel pagecache pages for this file. madvise(DONTNEED) + // alone only unmaps from the process address space; pagecache + // entries persist (`free` reports them as buff/cache and the OOM + // killer doesn't touch them, but they ARE counted against + // overcommit and can starve other allocations on tight-RAM + // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented + // way to evict pagecache for a specific fd's pages. + if (cfg_flags.dontneed) { + madvise(data_, size_, MADV_DONTNEED); + posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + } +#endif + munmap(data_, size_); + close(fd_); + } + +private: + int fd_; +}; + +std::unique_ptr MmapWrapper::create(const std::string& filename, bool writable) { int file_descriptor = open(filename.c_str(), O_RDONLY); if (file_descriptor == -1) { return nullptr; } + auto cfg_flags = get_mmap_flags(); + int mmap_flags = MAP_PRIVATE; #ifdef __linux__ - // performance flags used by llama.cpp - // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); - // mmap_flags |= MAP_POPULATE; + // Sequential access hint helps the kernel read-ahead efficiently and + // also encourages eviction of already-read pages (the kernel keeps + // a smaller working set when this is set). + if (cfg_flags.sequential) { + posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); + } + if (cfg_flags.populate) { + mmap_flags |= MAP_POPULATE; + } #endif struct stat sb; @@ -218,20 +279,27 @@ std::unique_ptr MmapWrapper::create(const std::string& filename) { size_t file_size = sb.st_size; - void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0); + if (file_size == 0) { + close(file_descriptor); + return nullptr; + } - close(file_descriptor); + int mmap_prot = PROT_READ | (writable ? 
PROT_WRITE : 0); + + void* mapped_data = mmap(nullptr, file_size, mmap_prot, mmap_flags, file_descriptor, 0); if (mapped_data == MAP_FAILED) { + close(file_descriptor); return nullptr; } #ifdef __linux__ - // performance flags used by llama.cpp - // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); + if (cfg_flags.willneed) { + posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); + } #endif - return std::make_unique(mapped_data, file_size); + return std::make_unique(mapped_data, file_size, file_descriptor); } #endif diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h index 36d168e2d..f9b7d4aba 100644 --- a/otherarch/sdcpp/util.h +++ b/otherarch/sdcpp/util.h @@ -45,7 +45,7 @@ sd::Tensor clip_preprocess(const sd::Tensor& image, int target_wid class MmapWrapper { public: - static std::unique_ptr create(const std::string& filename); + static std::unique_ptr create(const std::string& filename, bool writable = false); virtual ~MmapWrapper() = default; @@ -55,6 +55,7 @@ public: MmapWrapper& operator=(MmapWrapper&&) = delete; const uint8_t* data() const { return static_cast(data_); } + uint8_t* writable_data() { return static_cast(data_); } size_t size() const { return size_; } bool copy_data(void* buf, size_t n, size_t offset) const;