From f273fd35b9274a89d5a89f377e8ad95e30516b5c Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@users.noreply.github.com>
Date: Sat, 16 May 2026 00:23:10 -0300
Subject: [PATCH] sd: sync to master-601-eeac950 (#2206)

* sd: sync to master-601-eeac950

* sd: add mmap support
---
 expose.h                             |   1 +
 koboldcpp.py                         |   2 +
 otherarch/sdcpp/common/common.cpp    |   4 +-
 otherarch/sdcpp/denoiser.hpp         | 108 +++++++++++----
 otherarch/sdcpp/ggml_extend.hpp      |  19 ++-
 otherarch/sdcpp/model.cpp            | 195 +++++++++++++++++++++++----
 otherarch/sdcpp/model.h              |  21 +++
 otherarch/sdcpp/sdtype_adapter.cpp   |  11 ++
 otherarch/sdcpp/stable-diffusion.cpp | 148 ++++++++++++++++----
 otherarch/sdcpp/stable-diffusion.h   |   2 +
 otherarch/sdcpp/util.cpp             | 108 ++++++++++++---
 otherarch/sdcpp/util.h               |   3 +-
 12 files changed, 514 insertions(+), 108 deletions(-)
diff --git a/expose.h b/expose.h
index e4ee7ff6c..695771a9f 100644
--- a/expose.h
+++ b/expose.h
@@ -182,6 +182,7 @@ struct sd_load_model_inputs
     const int quant = 0;
     const bool flash_attention = false;
     const bool offload_cpu = false;
+    const bool use_mmap = false;
     const bool vae_cpu = false;
     const bool clip_cpu = false;
     const bool diffusion_conv_direct = false;
diff --git a/koboldcpp.py b/koboldcpp.py
index d39cba4df..1694289dd 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -363,6 +363,7 @@ class sd_load_model_inputs(ctypes.Structure):
                 ("quant", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
                 ("offload_cpu", ctypes.c_bool),
+                ("use_mmap", ctypes.c_bool),
                 ("vae_cpu", ctypes.c_bool),
                 ("clip_cpu", ctypes.c_bool),
                 ("diffusion_conv_direct", ctypes.c_bool),
@@ -2401,6 +2402,7 @@ def sd_load_model(model_filename,vae_filename,t5xxl_filename,clip1_filename,clip
     inputs.quant = args.sdquant
     inputs.flash_attention = args.sdflashattention
     inputs.offload_cpu = args.sdoffloadcpu
+    inputs.use_mmap = args.usemmap
     inputs.vae_cpu = args.sdvaecpu
     inputs.clip_cpu = False if args.sdclipgpu else True
     sdconvdirect = sd_convdirect_option(args.sdconvdirect)
diff --git a/otherarch/sdcpp/common/common.cpp b/otherarch/sdcpp/common/common.cpp
index d4c8a72b8..8ca7a2dcb 100644
--- a/otherarch/sdcpp/common/common.cpp
+++ b/otherarch/sdcpp/common/common.cpp
@@ -1244,12 +1244,12 @@ ArgOptions SDGenerationParams::get_options() {
          on_seed_arg},
         {"",
          "--sampling-method",
-         "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] "
+         "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] "
          "(default: euler for Flux/SD3/Wan, euler_a otherwise)",
          on_sample_method_arg},
         {"",
          "--high-noise-sampling-method",
-         "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde]"
+         "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp]"
          " default: euler for Flux/SD3/Wan, euler_a otherwise",
          on_high_noise_sample_method_arg},
         {"",
diff --git a/otherarch/sdcpp/denoiser.hpp b/otherarch/sdcpp/denoiser.hpp
index 831da2580..3742f53ba 100644
--- a/otherarch/sdcpp/denoiser.hpp
+++ b/otherarch/sdcpp/denoiser.hpp
@@ -752,7 +752,7 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     }
 };
 
-typedef std::function<sd::Tensor<float>(const sd::Tensor<float>&, float, int)> denoise_cb_t;
+typedef std::function<sd::Tensor<float>(const sd::Tensor<float>&, float, int, sd::Tensor<float>*)> denoise_cb_t;
 
 static std::pair<float, float> get_ancestral_step(float sigma_from,
                                                   float sigma_to,
@@ -828,7 +828,7 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
         float sigma       = sigmas[i];
-        auto denoised_opt = model(x, sigma, i + 1);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -851,7 +851,7 @@ static sd::Tensor<float> sample_euler_flow(denoise_cb_t model,
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
         float sigma       = sigmas[i];
-        auto denoised_opt = model(x, sigma, i + 1);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -873,7 +873,7 @@ static sd::Tensor<float> sample_euler(denoise_cb_t model,
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
         float sigma       = sigmas[i];
-        auto denoised_opt = model(x, sigma, i + 1);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -889,7 +889,7 @@ static sd::Tensor<float> sample_heun(denoise_cb_t model,
                                      const std::vector<float>& sigmas) {
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], -(i + 1));
+        auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -900,7 +900,7 @@ static sd::Tensor<float> sample_heun(denoise_cb_t model,
             x += d * dt;
         } else {
             sd::Tensor<float> x2 = x + d * dt;
-            auto denoised2_opt   = model(x2, sigmas[i + 1], i + 1);
+            auto denoised2_opt   = model(x2, sigmas[i + 1], i + 1, nullptr);
             if (denoised2_opt.empty()) {
                 return {};
             }
@@ -917,7 +917,7 @@ static sd::Tensor<float> sample_dpm2(denoise_cb_t model,
                                      const std::vector<float>& sigmas) {
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], -(i + 1));
+        auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -930,7 +930,7 @@ static sd::Tensor<float> sample_dpm2(denoise_cb_t model,
             float dt_1           = sigma_mid - sigmas[i];
             float dt_2           = sigmas[i + 1] - sigmas[i];
             sd::Tensor<float> x2 = x + d * dt_1;
-            auto denoised2_opt   = model(x2, sigma_mid, i + 1);
+            auto denoised2_opt   = model(x2, sigma_mid, i + 1, nullptr);
             if (denoised2_opt.empty()) {
                 return {};
             }
@@ -951,7 +951,7 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral(denoise_cb_t model,
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], -(i + 1));
+        auto denoised_opt = model(x, sigmas[i], -(i + 1), nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -967,7 +967,7 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral(denoise_cb_t model,
             float s              = t + 0.5f * h;
             float sigma_s        = sigma_fn(s);
             sd::Tensor<float> x2 = (sigma_s / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised;
-            auto denoised2_opt   = model(x2, sigma_s, i + 1);
+            auto denoised2_opt   = model(x2, sigma_s, i + 1, nullptr);
             if (denoised2_opt.empty()) {
                 return {};
             }
@@ -994,7 +994,7 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
 
         bool opt_first_step = (1.0 - sigma < 1e-6);
 
-        auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1));
+        auto denoised_opt = model(x, sigma, (opt_first_step ? 1 : -1) * (i + 1), nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1023,8 +1023,8 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
                 // so sigma_s = 1 = sigma, and sigma_s_i_ratio = sigma_s / sigma = 1
                 // u = (x*sigma_s_i_ratio)+(denoised*(1.0f-sigma_s_i_ratio))
                 //   = (x*1)+(denoised*0) = x
-                // so D_i = model(u, sigma_s, i + 1)
-                //        = model(x, sigma,   i + 1)
+                // so D_i = model(u, sigma_s, i + 1, nullptr)
+                //        = model(x, sigma,   i + 1, nullptr)
                 //        = denoised
                 D_i = denoised;
 
@@ -1057,7 +1057,7 @@ static sd::Tensor<float> sample_dpmpp_2s_ancestral_flow(denoise_cb_t model,
                 float sigma_s_i_ratio = sigma_s / sigma;
                 sd::Tensor<float> u   = (x * sigma_s_i_ratio) + (denoised * (1.0f - sigma_s_i_ratio));
 
-                auto denoised2_opt = model(u, sigma_s, i + 1);
+                auto denoised2_opt = model(u, sigma_s, i + 1, nullptr);
                 if (denoised2_opt.empty()) {
                     return {};
                 }
@@ -1084,7 +1084,7 @@ static sd::Tensor<float> sample_dpmpp_2m(denoise_cb_t model,
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], i + 1);
+        auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1116,7 +1116,7 @@ static sd::Tensor<float> sample_dpmpp_2m_v2(denoise_cb_t model,
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], i + 1);
+        auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1151,7 +1151,7 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
                                     bool is_flow_denoiser) {
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], i + 1);
+        auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1177,7 +1177,7 @@ static sd::Tensor<float> sample_ipndm(denoise_cb_t model,
         float sigma      = sigmas[i];
         float sigma_next = sigmas[i + 1];
 
-        auto denoised_opt = model(x, sigma, i + 1);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1221,7 +1221,7 @@ static sd::Tensor<float> sample_ipndm_v(denoise_cb_t model,
         float sigma  = sigmas[i];
         float t_next = sigmas[i + 1];
 
-        auto denoised_opt = model(x, sigma, i + 1);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1283,14 +1283,14 @@ static sd::Tensor<float> sample_res_multistep(denoise_cb_t model,
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        auto denoised_opt = model(x, sigmas[i], i + 1);
+        auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
         sd::Tensor<float> denoised = std::move(denoised_opt);
 
-        float sigma_from            = sigmas[i];
-        float sigma_to              = sigmas[i + 1];
+        float sigma_from = sigmas[i];
+        float sigma_to   = sigmas[i + 1];
 
         auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma_from, sigma_to, eta, is_flow_denoiser);
 
@@ -1360,7 +1360,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
         float sigma_from = sigmas[i];
         float sigma_to   = sigmas[i + 1];
 
-        auto denoised_opt = model(x, sigma_from, -(i + 1));
+        auto denoised_opt = model(x, sigma_from, -(i + 1), nullptr);
         if (denoised_opt.empty()) {
             return {};
         }
@@ -1386,7 +1386,7 @@ static sd::Tensor<float> sample_res_2s(denoise_cb_t model,
             sd::Tensor<float> eps1 = denoised - x0;
             sd::Tensor<float> x2   = x0 + eps1 * (h * a21);
 
-            auto denoised2_opt = model(x2, sigma_c2, i + 1);
+            auto denoised2_opt = model(x2, sigma_c2, i + 1, nullptr);
             if (denoised2_opt.empty()) {
                 return {};
             }
@@ -1463,7 +1463,7 @@ static sd::Tensor<float> sample_er_sde(denoise_cb_t model,
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        sd::Tensor<float> denoised = model(x, sigmas[i], i + 1);
+        sd::Tensor<float> denoised = model(x, sigmas[i], i + 1, nullptr);
         if (denoised.empty()) {
             return {};
         }
@@ -1549,7 +1549,7 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         float sigma    = sigmas[i];
         float sigma_to = sigmas[i + 1];
 
-        auto model_output_opt = model(x, sigma, i + 1);
+        auto model_output_opt = model(x, sigma, i + 1, nullptr);
         if (model_output_opt.empty()) {
             return {};
         }
@@ -1621,7 +1621,7 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
         int timestep_s    = (int)floor((1 - eta) * prev_timestep);
         float sigma       = sigmas[i];
 
-        auto model_output_opt = model(x, sigma, i + 1);
+        auto model_output_opt = model(x, sigma, i + 1, nullptr);
         if (model_output_opt.empty()) {
             return {};
         }
@@ -1649,6 +1649,56 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
     return x;
 }
 
+// Euler sampler, CFG++ variant. Returns an empty tensor when the model callback fails
+// or leaves its 4th argument (presumably the unconditional prediction - confirm) empty.
+static sd::Tensor<float> sample_euler_cfg_pp(denoise_cb_t model,
+                                             sd::Tensor<float> x,
+                                             const std::vector<float>& sigmas) {
+    int steps = static_cast<int>(sigmas.size()) - 1;
+    for (int i = 0; i < steps; i++) {
+        float sigma = sigmas[i];
+        sd::Tensor<float> uncond_denoised;
+        auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
+        if (denoised_opt.empty() || uncond_denoised.empty()) {
+            return {};
+        }
+        sd::Tensor<float> denoised = std::move(denoised_opt);
+        sd::Tensor<float> d        = (x - uncond_denoised) / sigma;
+        // step from the returned result along the direction derived from uncond_denoised
+        x = denoised + d * sigmas[i + 1];
+    }
+    return x;
+}
+
+// Ancestral Euler sampler, CFG++ variant: same update as sample_euler_cfg_pp, but it steps
+// to sigma_down and, only while the next sigma is > 0, adds sigma_up-scaled noise from rng.
+static sd::Tensor<float> sample_euler_ancestral_cfg_pp(denoise_cb_t model,
+                                                       sd::Tensor<float> x,
+                                                       const std::vector<float>& sigmas,
+                                                       std::shared_ptr<RNG> rng,
+                                                       float eta) {
+    int steps = static_cast<int>(sigmas.size()) - 1;
+    for (int i = 0; i < steps; i++) {
+        float sigma = sigmas[i];
+        sd::Tensor<float> uncond_denoised;
+        auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
+        if (denoised_opt.empty() || uncond_denoised.empty()) {
+            return {};
+        }
+        sd::Tensor<float> denoised = std::move(denoised_opt);
+        sd::Tensor<float> d        = (x - uncond_denoised) / sigma;
+
+        auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
+
+        x = denoised + d * sigma_down;
+        // no noise is injected on a step whose target sigma is not positive
+        if (sigmas[i + 1] > 0) {
+            x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
+        }
+    }
+    return x;
+}
+
 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
 static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
                                             denoise_cb_t model,
@@ -1694,6 +1744,10 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
             return sample_ddim_trailing(model, std::move(x), sigmas, rng, eta);
         case TCD_SAMPLE_METHOD:
             return sample_tcd(model, std::move(x), sigmas, rng, eta);
+        case EULER_CFG_PP_SAMPLE_METHOD:
+            return sample_euler_cfg_pp(model, std::move(x), sigmas);
+        case EULER_A_CFG_PP_SAMPLE_METHOD:
+            return sample_euler_ancestral_cfg_pp(model, std::move(x), sigmas, rng, eta);
         default:
             return {};
     }
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index 32d17916e..3ef49b239 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -2567,7 +2567,24 @@ public:
 
     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
+        if (num_tensors > 0) {
+            // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
+            // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch.
+            bool all_have_data = true;
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
+                if (t->data == nullptr) {
+                    all_have_data = false;
+                    break;
+                }
+            }
+            if (all_have_data) {
+                LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str());
+                params_buffer = nullptr;
+                rebuild_params_tensor_set();
+                return true;
+            }
+        }
+        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == nullptr) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp
index 4c05f756c..af9fcc3bd 100644
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -758,16 +758,10 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
     }
 }
 
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
-    int64_t process_time_ms = 0;
-    std::atomic<int64_t> read_time_ms(0);
-    std::atomic<int64_t> memcpy_time_ms(0);
-    std::atomic<int64_t> copy_to_backend_time_ms(0);
-    std::atomic<int64_t> convert_time_ms(0);
-    std::atomic<uint64_t> bytes_processed(0);
-
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
-    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
+void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
+    if (model_files_processed) {
+        return;
+    }
 
     int64_t start_time = ggml_time_ms();
 
@@ -779,22 +773,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
         processed_tensor_storages.push_back(tensor_storage);
     }
 
-    process_time_ms = ggml_time_ms() - start_time;
-
-    bool success                          = true;
-    size_t total_tensors_processed        = 0;
-    const size_t total_tensors_to_process = processed_tensor_storages.size();
-    const int64_t t_start                 = ggml_time_ms();
-    int last_n_threads                    = 1;
-
     for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
         std::string file_path = file_paths_[file_index];
-        LOG_DEBUG("loading tensors from %s", file_path.c_str());
 
-        std::vector<const TensorStorage*> file_tensors;
+        std::vector<TensorStorage> file_tensors;
         for (const auto& ts : processed_tensor_storages) {
             if (ts.file_index == file_index) {
-                file_tensors.push_back(&ts);
+                file_tensors.push_back(ts);
             }
         }
         if (file_tensors.empty()) {
@@ -803,21 +788,169 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
 
         bool is_zip = false;
         for (auto const& ts : file_tensors) {
-            if (ts->index_in_zip >= 0) {
+            if (ts.index_in_zip >= 0) {
                 is_zip = true;
                 break;
             }
         }
 
-        std::unique_ptr<MmapWrapper> mmapped;
+        ModelFileData fdata = {};
+        fdata.path          = file_path;
+        fdata.is_zip        = is_zip;
+        fdata.tensors       = std::move(file_tensors);
+
         if (enable_mmap && !is_zip) {
             LOG_DEBUG("using mmap for I/O");
-            mmapped = MmapWrapper::create(file_path);
-            if (!mmapped) {
-                LOG_WARN("failed to memory-map '%s'", file_path.c_str());
+            std::unique_ptr<MmapWrapper> mmapped = MmapWrapper::create(file_path, writable_mmap);
+            if (mmapped) {
+                uint8_t* mmap_data             = static_cast<uint8_t*>(mmapped->writable_data());
+                ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
+                if (buf_mmap) {
+                    LOG_INFO("using mmap for '%s'", file_path.c_str());
+                    fdata.mmbuffer = std::shared_ptr<struct ggml_backend_buffer>(buf_mmap, ggml_backend_buffer_free);
+                } else {
+                    LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
+                }
+                fdata.mmapped = std::shared_ptr<MmapWrapper>(std::move(mmapped));
+            } else {
+                LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str());
             }
+        } else if (!is_zip) {
+            LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)",
+                     file_path.c_str());
         }
 
+        file_data.push_back(std::move(fdata));
+    }
+
+    model_files_processed = true;
+
+    int64_t end_time        = ggml_time_ms();
+    int64_t process_time_ms = end_time - start_time;
+
+    LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
+}
+
+// Points entries of `tensors` at the memory-mapped model files instead of copying them.
+// Only tensors whose type, shape and byte size match the stored tensor are mapped; the
+// rest is left for load_tensors. Returns one mapping/buffer pair per file that was used.
+std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                                       std::set<std::string> ignore_tensors,
+                                                       bool writable_mmap) {
+    process_model_files(true, writable_mmap);
+    std::vector<MmapTensorStore> result;
+    uint64_t mapped_bytes = 0;
+    size_t mapped_tensors = 0;
+
+    LOG_DEBUG("memory-mapping tensors...");
+    int64_t t_start = ggml_time_ms();
+
+    for (auto& fdata : file_data) {
+        if (!fdata.mmbuffer)
+            continue;
+
+        ggml_backend_buffer_t buf_mmap = fdata.mmbuffer.get();
+        uint8_t* mmap_data             = static_cast<uint8_t*>(ggml_backend_buffer_get_base(buf_mmap));
+        const size_t mmap_size         = ggml_backend_buffer_get_size(buf_mmap);
+
+        size_t file_mapped_bytes   = 0;
+        size_t file_mapped_tensors = 0;
+
+        for (const auto& tensor_storage : fdata.tensors) {
+            const std::string& name = tensor_storage.name;
+            bool is_ignored         = false;
+            for (const auto& ignore_prefix : ignore_tensors) {
+                if (starts_with(name, ignore_prefix)) {
+                    is_ignored = true;
+                    break;
+                }
+            }
+            if (is_ignored)
+                continue;
+
+            auto it = tensors.find(name);
+            if (it == tensors.end() || it->second == nullptr)
+                continue;
+
+            ggml_tensor* dst_tensor = it->second;
+            if (tensor_storage.type != dst_tensor->type)
+                continue;
+
+            size_t tensor_size   = tensor_storage.nbytes();
+            size_t tensor_offset = tensor_storage.offset;
+            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
+                tensor_storage.ne[1] != dst_tensor->ne[1] ||
+                tensor_storage.ne[2] != dst_tensor->ne[2] ||
+                tensor_storage.ne[3] != dst_tensor->ne[3] ||
+                tensor_size != ggml_nbytes(dst_tensor)) {
+                // let load_tensors worry about this
+                continue;
+            }
+
+            // never hand out a pointer past the end of the mapping (truncated or corrupt file)
+            if (tensor_offset > mmap_size || tensor_size > mmap_size - tensor_offset) {
+                LOG_WARN("mmap: tensor '%s' does not fit inside '%s', not mapping it", name.c_str(), fdata.path.c_str());
+                continue;
+            }
+
+            dst_tensor->buffer = buf_mmap;
+            dst_tensor->data   = mmap_data + tensor_offset;
+
+            file_mapped_bytes += tensor_size;
+            file_mapped_tensors++;
+        }
+
+        if (file_mapped_bytes > 0) {
+            mapped_tensors += file_mapped_tensors;
+            mapped_bytes += file_mapped_bytes;
+            result.push_back({fdata.mmapped, fdata.mmbuffer});
+        }
+    }
+
+    int64_t duration_ms = ggml_time_ms() - t_start;
+    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
+             mapped_tensors,
+             result.size(),
+             mapped_bytes / (1024.0 * 1024.0),
+             duration_ms / 1000.0);
+
+    return result;
+}
+
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
+    process_model_files(enable_mmap, false);
+
+    std::atomic<int64_t> read_time_ms(0);
+    std::atomic<int64_t> memcpy_time_ms(0);
+    std::atomic<int64_t> copy_to_backend_time_ms(0);
+    std::atomic<int64_t> convert_time_ms(0);
+    std::atomic<uint64_t> bytes_processed(0);
+
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
+
+    int64_t start_time = ggml_time_ms();
+
+    size_t total_tensors_to_process = 0;
+    for (const auto& fdata : file_data) {
+        total_tensors_to_process += fdata.tensors.size();
+    }
+
+    bool success                   = true;
+    size_t total_tensors_processed = 0;
+    const int64_t t_start          = start_time;
+    int last_n_threads             = 1;
+
+    for (auto& fdata : file_data) {
+        const std::string& file_path = fdata.path;
+        LOG_DEBUG("loading tensors from %s", file_path.c_str());
+
+        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
+
+        bool is_zip = fdata.is_zip;
+
+        std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;
+
         int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
         if (n_threads < 1) {
             n_threads = 1;
@@ -858,7 +991,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                         break;
                     }
 
-                    const TensorStorage& tensor_storage = *file_tensors[idx];
+                    const TensorStorage& tensor_storage = file_tensors[idx];
                     ggml_tensor* dst_tensor             = nullptr;
 
                     t0 = ggml_time_ms();
@@ -875,6 +1008,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                         continue;
                     }
 
+                    // skip mmapped tensors
+                    if (dst_tensor->buffer != nullptr && dst_tensor->buffer == fdata.mmbuffer.get()) {
+                        continue;
+                    }
+
                     size_t nbytes_to_read = tensor_storage.nbytes_to_read();
 
                     auto read_data = [&](char* buf, size_t n) {
@@ -1018,9 +1156,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     }
 
     int64_t end_time = ggml_time_ms();
-    LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
+    LOG_INFO("loading tensors completed, taking %.2fs (read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
              (end_time - start_time) / 1000.f,
-             process_time_ms / 1000.f,
              (read_time_ms.load() / (float)last_n_threads) / 1000.f,
              (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
              (convert_time_ms.load() / (float)last_n_threads) / 1000.f,
diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h
index 2689f63bd..97509587b 100644
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@@ -193,10 +193,27 @@ using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;
 
 TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules);
 
+class MmapWrapper;
+
+struct ModelFileData {
+    std::string path;                                      // path of the model file
+    std::vector<TensorStorage> tensors;                    // tensor storages whose file_index refers to this file
+    std::shared_ptr<MmapWrapper> mmapped;                  // mapping of the file; null when it is not memory-mapped
+    std::shared_ptr<struct ggml_backend_buffer> mmbuffer;  // CPU backend buffer wrapping the mapping; may be null
+    bool is_zip = false;                                   // true when any tensor of the file has index_in_zip >= 0
+};
+
+struct MmapTensorStore {
+    std::shared_ptr<MmapWrapper> mmapped;                  // shared handle to a model file mapping
+    std::shared_ptr<struct ggml_backend_buffer> mmbuffer;  // shared handle to the backend buffer built on that mapping
+};
+
 class ModelLoader {
 protected:
     SDVersion version_ = VERSION_COUNT;
     std::vector<std::string> file_paths_;
+    std::vector<ModelFileData> file_data;
+    bool model_files_processed = false;
     String2TensorStorage tensor_storage_map;
 
     void add_tensor_storage(const TensorStorage& tensor_storage);
@@ -221,6 +238,10 @@ public:
     std::map<ggml_type, uint32_t> get_vae_wtype_stat();
     String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
     void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
+    void process_model_files(bool enable_mmap = false, bool writable_mmap = true);
+    std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                              std::set<std::string> ignore_tensors = {},
+                                              bool writable                        = true);
     bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
     bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
                       std::set<std::string> ignore_tensors = {},
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index f622a4354..eb7d6eb91 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -360,6 +360,16 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     {
         printf("Conv2D Direct for VAE model is enabled\n");
     }
+    if (inputs.use_mmap && inputs.offload_cpu) {
+        printf("Offloading weights to system RAM with mmap\n");
+        if (!lora_dynamic && inputs.lora_len > 0) {
+            printf("Note: static LoRAs can reduce mmap memory savings!\n");
+        }
+    } else if (inputs.offload_cpu) {
+        printf("Offloading weights to system RAM\n");
+    } else if (inputs.use_mmap) {
+        printf("Using mmap for I/O\n");
+    }
     if(inputs.quant > 0)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -424,6 +434,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     params.vae_conv_direct = sd_params->vae_conv_direct;
     params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask;
     params.offload_params_to_cpu = inputs.offload_cpu;
+    params.enable_mmap = inputs.use_mmap;
     params.keep_vae_on_cpu = inputs.vae_cpu;
     params.keep_clip_on_cpu = inputs.clip_cpu;
     params.lora_apply_mode = (lora_apply_mode_t)lora_apply_mode;
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 9077f0618..cc44f18f4 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -75,6 +75,8 @@ const char* sampling_methods_str[] = {
     "Res Multistep",
     "Res 2s",
     "ER-SDE",
+    "Euler CFG++",
+    "Euler A CFG++",
 };
 
 /*================================================== Helper Functions ================================================*/
@@ -110,6 +112,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
 
 class StableDiffusionGGML {
 public:
+    std::vector<MmapTensorStore> mmap_tensor_store;
     ggml_backend_t backend             = nullptr;  // general backend
     ggml_backend_t clip_backend        = nullptr;
     ggml_backend_t control_net_backend = nullptr;
@@ -560,6 +563,51 @@ public:
             apply_lora_immediately = false;
         }
 
+        std::map<std::string, ggml_tensor*> mmap_able_tensors;
+        bool enable_mmap_tensors = false;
+        bool main_backend_mmap   = false;
+        bool needs_writable_mmap = false;
+        if (sd_ctx_params->enable_mmap) {
+            if (apply_lora_immediately) {
+                needs_writable_mmap = true;
+                LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
+            }
+            enable_mmap_tensors = true;
+            if (offload_params_to_cpu) {
+                main_backend_mmap = true;
+            } else {
+                ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+                struct ggml_backend_dev_props props;
+                ggml_backend_dev_get_props(dev, &props);
+                main_backend_mmap = props.caps.buffer_from_host_ptr;
+            }
+        }
+
+        // split definition to avoid msvc choking on the extra parameter handling
+        auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) {
+            std::map<std::string, ggml_tensor*> temp;
+            model->get_param_tensors(temp, prefix);
+            bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
+            for (const auto& [key, tensor] : temp) {
+                tensors[key] = tensor;
+                if (do_mmap) {
+                    mmap_able_tensors[key] = tensor;
+                }
+            }
+        };
+
+        auto get_param_tensors = [&](auto&& model, bool force_cpu = false) {
+            std::map<std::string, ggml_tensor*> temp;
+            model->get_param_tensors(temp);
+            bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
+            for (const auto& [key, tensor] : temp) {
+                tensors[key] = tensor;
+                if (do_mmap) {
+                    mmap_able_tensors[key] = tensor;
+                }
+            }
+        };
+
         if (sd_version_is_control(version)) {
             // Might need vae encode for control cond
             vae_decode_only = false;
@@ -671,8 +719,7 @@ public:
                                                                              offload_params_to_cpu,
                                                                              tensor_storage_map);
                     clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                    clip_vision->alloc_params_buffer();
-                    clip_vision->get_param_tensors(tensors);
+                    get_param_tensors(clip_vision);
                 }
             } else if (sd_version_is_qwen_image(version)) {
                 bool enable_vision = false;
@@ -748,12 +795,10 @@ public:
             }
 
             cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-            cond_stage_model->alloc_params_buffer();
-            cond_stage_model->get_param_tensors(tensors);
+            get_param_tensors(cond_stage_model, clip_on_cpu);
 
             diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-            diffusion_model->alloc_params_buffer();
-            diffusion_model->get_param_tensors(tensors);
+            get_param_tensors(diffusion_model);
 
             if (sd_version_is_unet_edit(version)) {
                 vae_decode_only = false;
@@ -761,8 +806,7 @@ public:
 
             if (high_noise_diffusion_model) {
                 high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                high_noise_diffusion_model->alloc_params_buffer();
-                high_noise_diffusion_model->get_param_tensors(tensors);
+                get_param_tensors(high_noise_diffusion_model);
             }
 
             if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -825,6 +869,8 @@ public:
                 }
             };
 
+            bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu;
+
             if (version == VERSION_CHROMA_RADIANCE) {
                 LOG_INFO("using FakeVAE");
                 first_stage_model = std::make_shared<FakeVAE>(version,
@@ -834,20 +880,17 @@ public:
                 LOG_INFO("using TAE for encoding / decoding");
                 first_stage_model = create_tae();
                 first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                first_stage_model->alloc_params_buffer();
-                first_stage_model->get_param_tensors(tensors, "tae");
+                get_param_tensors_p(first_stage_model, force_vae_cpu, "tae");
             } else {
                 LOG_INFO("using VAE for encoding / decoding");
                 first_stage_model = create_vae();
                 first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                first_stage_model->alloc_params_buffer();
-                first_stage_model->get_param_tensors(tensors, "first_stage_model");
+                get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model");
                 if (use_tae && tae_preview_only) {
                     LOG_INFO("using TAE for preview");
                     preview_vae = create_tae();
                     preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes);
-                    preview_vae->alloc_params_buffer();
-                    preview_vae->get_param_tensors(tensors, "tae");
+                    get_param_tensors_p(preview_vae, force_vae_cpu, "tae");  // register the preview TAE's own weights so load_tensors fills them
                 }
             }
 
@@ -916,11 +959,7 @@ public:
               }
             }
             if (use_pmid) {
-                if (!pmid_model->alloc_params_buffer()) {
-                    LOG_ERROR(" pmid model params buffer allocation failed");
-                    return false;
-                }
-                pmid_model->get_param_tensors(tensors, "pmid");
+                get_param_tensors_p(pmid_model, false, "pmid");
             }
 
             if (sd_ctx_params->flash_attn) {
@@ -1003,6 +1042,41 @@ public:
             ignore_tensors.insert("text_encoders.llm.vision_tower.");
             ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
         }
+
+        if (enable_mmap_tensors) {
+            if (mmap_able_tensors.empty()) {
+                LOG_DEBUG("no tensors could be memory-mapped");
+            } else {
+                mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors, needs_writable_mmap);
+            }
+        }
+
+        if (clip_vision) {
+            clip_vision->alloc_params_buffer();
+        }
+        if (cond_stage_model) {
+            cond_stage_model->alloc_params_buffer();
+        }
+        if (diffusion_model) {
+            diffusion_model->alloc_params_buffer();
+        }
+        if (high_noise_diffusion_model) {
+            high_noise_diffusion_model->alloc_params_buffer();
+        }
+        if (first_stage_model) {
+            first_stage_model->alloc_params_buffer();
+        }
+        if (preview_vae) {
+            preview_vae->alloc_params_buffer();
+        }
+        if (use_pmid && pmid_model) {
+            if (!pmid_model->alloc_params_buffer()) {
+                LOG_ERROR(" pmid model params buffer allocation failed");
+                ggml_free(ctx);
+                return false;
+            }
+        }
+
         bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
@@ -1828,6 +1902,15 @@ public:
                                                                                            cache_params,
                                                                                            denoiser.get(),
                                                                                            sigmas);
+
+        // Spectrum cache is not supported for CFG++ samplers
+        if (method == EULER_CFG_PP_SAMPLE_METHOD || method == EULER_A_CFG_PP_SAMPLE_METHOD) {
+            if (cache_runtime.spectrum_enabled) {
+                LOG_WARN("Spectrum cache requested but not supported for CFG++ samplers");
+                cache_runtime.spectrum_enabled = false;
+            }
+        }
+
         size_t steps                                = sigmas.size() - 1;
         bool has_skiplayer                          = slg_scale != 0.0f && !skip_layers.empty();
         if (has_skiplayer && !sd_version_is_dit(version)) {
@@ -1842,7 +1925,7 @@ public:
         sd::Tensor<float> denoised   = x_t;
         SamplePreviewContext preview = prepare_sample_preview_context();
 
-        auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::Tensor<float> {
+        auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step, sd::Tensor<float>* out_uncond_denoised = nullptr) -> sd::Tensor<float> {
             if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
             }
@@ -1865,15 +1948,17 @@ public:
             }
 
             if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) {
-                cache_runtime.spectrum.predict(&denoised);
-                if (!denoise_mask.empty()) {
-                    denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask);
+                if (out_uncond_denoised == nullptr) {
+                    cache_runtime.spectrum.predict(&denoised);
+                    if (!denoise_mask.empty()) {
+                        denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask);
+                    }
+                    if (sd_should_preview_denoised() && preview.callback != nullptr) {
+                        preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false);
+                    }
+                    report_sample_progress(step, steps, t0);
+                    return denoised;
                 }
-                if (sd_should_preview_denoised() && preview.callback != nullptr) {
-                    preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false);
-                }
-                report_sample_progress(step, steps, t0);
-                return denoised;
             }
 
             if (sd_should_preview_noisy() && preview.callback != nullptr) {
@@ -1996,6 +2081,10 @@ public:
                 latent_result += (cond_out - skip_cond_out) * slg_scale;
             }
             denoised = latent_result * c_out + x * c_skip;
+            if (out_uncond_denoised != nullptr) {
+                sd::Tensor<float> base_uncond = !uncond_out.empty() ? uncond_out : cond_out;
+                *out_uncond_denoised = base_uncond * c_out + x * c_skip;
+            }
             if (cache_runtime.spectrum_enabled) {
                 cache_runtime.spectrum.update(denoised);
             }
@@ -2210,6 +2299,8 @@ const char* sample_method_to_str[] = {
     "res_multistep",
     "res_2s",
     "er_sde",
+    "euler_cfg_pp",
+    "euler_a_cfg_pp",
 };
 
 const char* sd_sample_method_name(enum sample_method_t sample_method) {
@@ -2772,6 +2863,7 @@ static float resolve_eta(sd_ctx_t* sd_ctx,
             case EULER_A_SAMPLE_METHOD:
             case DPMPP2S_A_SAMPLE_METHOD:
             case ER_SDE_SAMPLE_METHOD:
+            case EULER_A_CFG_PP_SAMPLE_METHOD:
                 return 1.0f;
             default:;
         }
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index e4f1b307e..449224eb5 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -51,6 +51,8 @@ enum sample_method_t {
     RES_MULTISTEP_SAMPLE_METHOD,
     RES_2S_SAMPLE_METHOD,
     ER_SDE_SAMPLE_METHOD,
+    EULER_CFG_PP_SAMPLE_METHOD,
+    EULER_A_CFG_PP_SAMPLE_METHOD,
     SAMPLE_METHOD_COUNT
 };
 
diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp
index 17a41043d..0f81152ed 100644
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@@ -121,7 +121,7 @@ private:
     HANDLE hmapping_;
 };
 
-std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
     void* mapped_data = nullptr;
     size_t file_size  = 0;
 
@@ -146,14 +146,18 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
 
     file_size = static_cast<size_t>(size.QuadPart);
 
-    HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
+    DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY;
+
+    HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, page_prot, 0, 0, nullptr);
 
     if (mapping_handle == nullptr) {
         CloseHandle(file_handle);
         return nullptr;
     }
 
-    mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
+    DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ;
+
+    mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size);
 
     if (mapped_data == nullptr) {
         CloseHandle(mapping_handle);
@@ -186,28 +190,85 @@ std::string sd_get_u8path(const std::string& file_path)
     return std::filesystem::path(file_path).string();
 }
 
-class MmapWrapperImpl : public MmapWrapper {
-public:
-    MmapWrapperImpl(void* data, size_t size)
-        : MmapWrapper(data, size) {}
-
-    ~MmapWrapperImpl() override {
-        munmap(data_, size_);
-    }
+struct MmapFlags {  // opt-in mmap tuning switches, filled in by get_mmap_flags() from SD_MMAP_FLAGS
+    bool sequential;  // posix_fadvise(POSIX_FADV_SEQUENTIAL) on the file before mapping (Linux only)
+    bool populate;    // add MAP_POPULATE to the mmap flags (Linux only)
+    bool willneed;    // posix_madvise(POSIX_MADV_WILLNEED) on the mapping after it is created (Linux only)
+    bool dontneed;    // madvise + posix_fadvise DONTNEED when the mapping is destroyed (Linux only)
 };
 
-std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+static MmapFlags get_mmap_flags() {  // parses SD_MMAP_FLAGS: comma-separated, case-insensitive flag names
+    MmapFlags result          = {};  // every flag defaults to false
+    const char* SD_MMAP_FLAGS = std::getenv("SD_MMAP_FLAGS");
+    if (SD_MMAP_FLAGS && *SD_MMAP_FLAGS) {
+        std::stringstream ss(SD_MMAP_FLAGS);
+        std::string token;
+        while (std::getline(ss, token, ',')) {
+            std::string ntoken = trim(token);
+            std::transform(ntoken.begin(), ntoken.end(), ntoken.begin(), [](unsigned char c) { return static_cast<char>(::tolower(c)); });  // via unsigned char: tolower on a negative char is UB
+            if (ntoken == "sequential") {
+                result.sequential = true;
+            } else if (ntoken == "populate") {
+                result.populate = true;
+            } else if (ntoken == "willneed") {
+                result.willneed = true;
+            } else if (ntoken == "dontneed") {
+                result.dontneed = true;
+            }  // unrecognised tokens are silently ignored
+        }
+    }
+    return result;
+}
+
+class MmapWrapperImpl : public MmapWrapper {  // MmapWrapper that unmaps its region and closes its file descriptor on destruction
+public:
+    MmapWrapperImpl(void* data, size_t size, int fd)
+        : MmapWrapper(data, size), fd_(fd) {}
+
+    ~MmapWrapperImpl() override {
+#ifdef __linux__
+        auto cfg_flags = get_mmap_flags();  // SD_MMAP_FLAGS is re-read here, at destruction time
+
+        // Drop the kernel pagecache pages for this file. madvise(DONTNEED)
+        // alone only unmaps from the process address space; pagecache
+        // entries persist (`free` reports them as buff/cache and the OOM
+        // killer doesn't touch them, but they ARE counted against
+        // overcommit and can starve other allocations on tight-RAM
+        // systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented
+        // way to evict pagecache for a specific fd's pages.
+        if (cfg_flags.dontneed) {
+            madvise(data_, size_, MADV_DONTNEED);
+            posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+        }
+#endif
+        munmap(data_, size_);
+        close(fd_);  // the descriptor is only released here, after the optional fadvise above
+    }
+
+private:
+    int fd_;  // descriptor of the mapped file, passed in by MmapWrapper::create; closed in the destructor
+};
+
+std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
     int file_descriptor = open(filename.c_str(), O_RDONLY);
     if (file_descriptor == -1) {
         return nullptr;
     }
 
+    auto cfg_flags = get_mmap_flags();
+
     int mmap_flags = MAP_PRIVATE;
 
 #ifdef __linux__
-    // performance flags used by llama.cpp
-    // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
-    // mmap_flags |= MAP_POPULATE;
+    // Sequential access hint helps the kernel read-ahead efficiently and
+    // also encourages eviction of already-read pages (the kernel keeps
+    // a smaller working set when this is set).
+    if (cfg_flags.sequential) {
+        posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
+    }
+    if (cfg_flags.populate) {
+        mmap_flags |= MAP_POPULATE;
+    }
 #endif
 
     struct stat sb;
@@ -218,20 +279,27 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
 
     size_t file_size = sb.st_size;
 
-    void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
+    if (file_size == 0) {
+        close(file_descriptor);
+        return nullptr;
+    }
 
-    close(file_descriptor);
+    int mmap_prot = PROT_READ | (writable ? PROT_WRITE : 0);
+
+    void* mapped_data = mmap(nullptr, file_size, mmap_prot, mmap_flags, file_descriptor, 0);
 
     if (mapped_data == MAP_FAILED) {
+        close(file_descriptor);
         return nullptr;
     }
 
 #ifdef __linux__
-    // performance flags used by llama.cpp
-    // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
+    if (cfg_flags.willneed) {
+        posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
+    }
 #endif
 
-    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
+    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_descriptor);
 }
 
 #endif
diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h
index 36d168e2d..f9b7d4aba 100644
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@@ -45,7 +45,7 @@ sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_wid
 
 class MmapWrapper {
 public:
-    static std::unique_ptr<MmapWrapper> create(const std::string& filename);
+    static std::unique_ptr<MmapWrapper> create(const std::string& filename, bool writable = false);
 
     virtual ~MmapWrapper() = default;
 
@@ -55,6 +55,7 @@ public:
     MmapWrapper& operator=(MmapWrapper&&)      = delete;
 
     const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
+    uint8_t* writable_data() { return static_cast<uint8_t*>(data_); }
     size_t size() const { return size_; }
     bool copy_data(void* buf, size_t n, size_t offset) const;