From 627e317cd7c6c643b892e2fde1c7d664f404893e Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Tue, 19 May 2026 23:37:17 -0300
Subject: [PATCH] sd: sync to master-633-5b0267e

---
 otherarch/sdcpp/common/common.cpp    |  2 +-
 otherarch/sdcpp/ggml_graph_cut.cpp   | 27 +++++++-------
 otherarch/sdcpp/llm.hpp              |  3 +-
 otherarch/sdcpp/ltx_audio_vae.h      | 54 ++++++++++++++++++++--------
 otherarch/sdcpp/stable-diffusion.cpp |  3 +-
 otherarch/sdcpp/stable-diffusion.h   |  2 +-
 6 files changed, 59 insertions(+), 32 deletions(-)
diff --git a/otherarch/sdcpp/common/common.cpp b/otherarch/sdcpp/common/common.cpp
index dd4581eb0..85c03b412 100644
--- a/otherarch/sdcpp/common/common.cpp
+++ b/otherarch/sdcpp/common/common.cpp
@@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
     options.float_options = {
         {"",
          "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
          &max_vram},
     };
 
diff --git a/otherarch/sdcpp/ggml_graph_cut.cpp b/otherarch/sdcpp/ggml_graph_cut.cpp
index d07e08be7..a20de30c9 100644
--- a/otherarch/sdcpp/ggml_graph_cut.cpp
+++ b/otherarch/sdcpp/ggml_graph_cut.cpp
@@ -16,8 +16,7 @@
 
 namespace sd::ggml_graph_cut {
 
-    static constexpr double MAX_VRAM_BYTES_PER_GIB      = 1024.0 * 1024.0 * 1024.0;
-    static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
+    static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
 
     static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
         if (tensor == nullptr) {
@@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
         return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
     }
 
-    static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
+    static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
         if (backend == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
             return 0;
         }
 
         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
         if (dev == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
             return 0;
         }
         if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-            LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
             return 0;
         }
 
         size_t free_vram  = 0;
         size_t total_vram = 0;
         ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+        const double spare_req   = MAX_VRAM_BYTES_PER_GIB * spare_vram;  // may be NaN/inf or exceed SIZE_MAX: clamp before converting (UB otherwise)
+        const size_t spare_bytes = !(spare_req < static_cast<double>(free_vram)) ? free_vram : static_cast<size_t>(spare_req);
 
-        if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
-            LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
-                     free_vram / MAX_VRAM_BYTES_PER_GIB);
+        if (free_vram <= spare_bytes) {
+            LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget", free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
             return 0;
         }
 
-        const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
-        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
+        const size_t max_vram_bytes = free_vram - spare_bytes;
+        LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
                  free_vram / MAX_VRAM_BYTES_PER_GIB,
                  total_vram / MAX_VRAM_BYTES_PER_GIB,
+                 spare_vram,
                  max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
         return max_vram_bytes;
     }
 
     float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
-        if (max_vram != -1.f) {
+        if (max_vram >= 0.f) {  // non-negative: explicit budget in GiB, returned unchanged (NaN fails this test and falls through)
             return max_vram;
         }
-        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
+        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));  // negative: -max_vram GiB is kept free
     }
 
     static Segment make_segment_seed(const Plan& plan,
diff --git a/otherarch/sdcpp/llm.hpp b/otherarch/sdcpp/llm.hpp
index ab673b208..cec8b1dcc 100644
--- a/otherarch/sdcpp/llm.hpp
+++ b/otherarch/sdcpp/llm.hpp
@@ -1403,7 +1403,8 @@ namespace LLM {
                                    out_layers,
                                    return_all_hidden_states);
             };
-            return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, true),
+                                                   input_ids.dim() + 1);
         }
 
         int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
diff --git a/otherarch/sdcpp/ltx_audio_vae.h b/otherarch/sdcpp/ltx_audio_vae.h
index d1d765d75..aad8e0f87 100644
--- a/otherarch/sdcpp/ltx_audio_vae.h
+++ b/otherarch/sdcpp/ltx_audio_vae.h
@@ -349,42 +349,66 @@ namespace LTXV {
         return type == GGML_TYPE_BF16 ? GGML_TYPE_F16 : type;
     }
 
-    static ggml_tensor* repeat_1d_value(ggml_context* ctx, ggml_tensor* x, int64_t count) {
-        GGML_ASSERT(x->ne[0] == 1);
-        ggml_tensor* target = ggml_new_tensor_4d(ctx, x->type, count, x->ne[1], x->ne[2], x->ne[3]);
-        return ggml_repeat(ctx, x, target);
+    // Repeats `x` to shape (ne0, ne1, ne2, ne3) via ggml_repeat_4d.
+    // On a Vulkan backend, F16/BF16 inputs are cast to F32 for the repeat and
+    // the result is cast back to the input type; every other case repeats directly.
+    static ggml_tensor* repeat_with_vulkan_f32_workaround(ggml_backend_t backend,
+                                                          ggml_context* ctx,
+                                                          ggml_tensor* x,
+                                                          int64_t ne0,
+                                                          int64_t ne1,
+                                                          int64_t ne2,
+                                                          int64_t ne3) {
+        // F16/BF16 can never equal F32, so no separate "!= F32" test is needed.
+        const bool is_half_float = x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16;
+        // NOTE(review): presumably Vulkan cannot repeat half-precision tensors directly - confirm.
+        if (is_half_float && sd_backend_is(backend, "vulkan")) {
+            auto x_f32    = ggml_cast(ctx, x, GGML_TYPE_F32);
+            auto repeated = ggml_repeat_4d(ctx, x_f32, ne0, ne1, ne2, ne3);
+            return ggml_cast(ctx, repeated, x->type);
+        }
+        // Default path: repeat in the tensor's own type.
+        return ggml_repeat_4d(ctx, x, ne0, ne1, ne2, ne3);
     }
 
-    static ggml_tensor* replicate_pad_1d(ggml_context* ctx, ggml_tensor* x, int64_t left, int64_t right) {
+    static ggml_tensor* repeat_1d_value(GGMLRunnerContext* runner_ctx, ggml_tensor* x, int64_t count) {
+        // Expands a tensor whose dim 0 has length 1 to `count` entries along dim 0; the other dims are kept.
+        GGML_ASSERT(x->ne[0] == 1);
+        return repeat_with_vulkan_f32_workaround(runner_ctx->backend, runner_ctx->ggml_ctx, x, count, x->ne[1], x->ne[2], x->ne[3]);
+    }
+
+    static ggml_tensor* replicate_pad_1d(GGMLRunnerContext* runner_ctx, ggml_tensor* x, int64_t left, int64_t right) {
+        auto ctx = runner_ctx->ggml_ctx;  // ggml context used by the slice/concat ops below
         if (left > 0) {
             auto first = ggml_ext_slice(ctx, x, 0, 0, 1);
-            x          = ggml_concat(ctx, repeat_1d_value(ctx, first, left), x, 0);
+            x          = ggml_concat(ctx, repeat_1d_value(runner_ctx, first, left), x, 0);  // prepend `left` copies of `first` on dim 0
         }
         if (right > 0) {
             auto last = ggml_ext_slice(ctx, x, 0, x->ne[0] - 1, x->ne[0]);
-            x         = ggml_concat(ctx, x, repeat_1d_value(ctx, last, right), 0);
+            x         = ggml_concat(ctx, x, repeat_1d_value(runner_ctx, last, right), 0);  // append `right` copies of `last` on dim 0
         }
         return x;
     }
 
-    static ggml_tensor* tile_depthwise_filter_1d(ggml_context* ctx, ggml_tensor* filter, int64_t channels) {
+    static ggml_tensor* tile_depthwise_filter_1d(GGMLRunnerContext* runner_ctx, ggml_tensor* filter, int64_t channels) {
+        auto ctx          = runner_ctx->ggml_ctx;  // ggml context used by the reshape below
         ggml_tensor* base = filter;
         if (ggml_n_dims(base) == 3) {
             base = ggml_reshape_4d(ctx, base, base->ne[0], 1, 1, 1);
         } else if (ggml_n_dims(base) == 1) {
             base = ggml_reshape_4d(ctx, base, base->ne[0], 1, 1, 1);
         }
-        ggml_tensor* target = ggml_new_tensor_4d(ctx, base->type, base->ne[0], 1, channels, 1);
-        return ggml_repeat(ctx, base, target);
+        return repeat_with_vulkan_f32_workaround(runner_ctx->backend, ctx, base, base->ne[0], 1, channels, 1);  // tile to shape (ne0, 1, channels, 1)
     }
 
-    static ggml_tensor* depthwise_conv1d(ggml_context* ctx,
+    static ggml_tensor* depthwise_conv1d(GGMLRunnerContext* runner_ctx,
                                          ggml_tensor* x,
                                          ggml_tensor* filter,
                                          int stride,
                                          int padding) {
+        auto ctx = runner_ctx->ggml_ctx;  // ggml context used by the conv/reshape ops below
         GGML_ASSERT(x->ne[3] == 1);
-        auto tiled = tile_depthwise_filter_1d(ctx, filter, x->ne[1]);
+        auto tiled = tile_depthwise_filter_1d(runner_ctx, filter, x->ne[1]);  // filter repeated x->ne[1] times; needs runner_ctx for its backend
         auto out   = ggml_conv_1d_dw(ctx, tiled, x, stride, padding, 1);
         return ggml_reshape_4d(ctx, out, out->ne[0], out->ne[1], 1, 1);
     }
@@ -654,7 +678,7 @@ namespace LTXV {
             int up_pad_left  = up_pad * up_ratio + (up_kernel_size - up_ratio) / 2;
             int up_pad_right = up_pad * up_ratio + (up_kernel_size - up_ratio + 1) / 2;
 
-            x = replicate_pad_1d(ctx->ggml_ctx, x, up_pad, up_pad);
+            x = replicate_pad_1d(ctx, x, up_pad, up_pad);
             x = depthwise_conv_transpose1d(ctx->ggml_ctx, x, up_filter, up_ratio);
             x = ggml_ext_slice(ctx->ggml_ctx, x, 0, up_pad_left, x->ne[0] - up_pad_right);
 
@@ -662,8 +686,8 @@ namespace LTXV {
 
             int down_pad_left  = down_kernel_size / 2 - (down_kernel_size % 2 == 0 ? 1 : 0);
             int down_pad_right = down_kernel_size / 2;
-            x                  = replicate_pad_1d(ctx->ggml_ctx, x, down_pad_left, down_pad_right);
-            x                  = depthwise_conv1d(ctx->ggml_ctx, x, down_filter, down_ratio, 0);
+            x                  = replicate_pad_1d(ctx, x, down_pad_left, down_pad_right);
+            x                  = depthwise_conv1d(ctx, x, down_filter, down_ratio, 0);
             return x;
         }
     };
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index fc60d672f..fa90942e1 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -531,8 +531,9 @@ public:
             LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path);
             if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) {
                 LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path);
+            } else {
+                use_tae = true;
             }
-            use_tae = true;
         }
 
         if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) {
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index dc1fb093f..3d5b3fd18 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -126,7 +126,7 @@ enum sd_type_t {
     // SD_TYPE_IQ4_NL_8_8 = 38,
     SD_TYPE_MXFP4 = 39,  // MXFP4 (1 block)
     SD_TYPE_NVFP4 = 40,  // NVFP4 (4 blocks, E4M3 scale)
-    SD_TYPE_Q1_0 = 41,
+    SD_TYPE_Q1_0  = 41,
     SD_TYPE_COUNT = 42,
 };