From 627e317cd7c6c643b892e2fde1c7d664f404893e Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Tue, 19 May 2026 23:37:17 -0300 Subject: [PATCH] sd: sync to master-633-5b0267e --- otherarch/sdcpp/common/common.cpp | 2 +- otherarch/sdcpp/ggml_graph_cut.cpp | 27 +++++++------- otherarch/sdcpp/llm.hpp | 3 +- otherarch/sdcpp/ltx_audio_vae.h | 54 ++++++++++++++++++++-------- otherarch/sdcpp/stable-diffusion.cpp | 3 +- otherarch/sdcpp/stable-diffusion.h | 2 +- 6 files changed, 59 insertions(+), 32 deletions(-) diff --git a/otherarch/sdcpp/common/common.cpp b/otherarch/sdcpp/common/common.cpp index dd4581eb0..85c03b412 100644 --- a/otherarch/sdcpp/common/common.cpp +++ b/otherarch/sdcpp/common/common.cpp @@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() { options.float_options = { {"", "--max-vram", - "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB", + "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)", &max_vram}, }; diff --git a/otherarch/sdcpp/ggml_graph_cut.cpp b/otherarch/sdcpp/ggml_graph_cut.cpp index d07e08be7..a20de30c9 100644 --- a/otherarch/sdcpp/ggml_graph_cut.cpp +++ b/otherarch/sdcpp/ggml_graph_cut.cpp @@ -16,8 +16,7 @@ namespace sd::ggml_graph_cut { - static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0; - static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL; + static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0; static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) { if (tensor == nullptr) { @@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut { return static_cast(static_cast(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB); } - static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) { + static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) { if (backend == nullptr) { - LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting"); + LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting"); return 0; } ggml_backend_dev_t dev = ggml_backend_get_device(backend); if (dev == nullptr) { - LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting"); + LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting"); return 0; } if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { - LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting"); + LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting"); return 0; } size_t free_vram = 0; size_t total_vram = 0; ggml_backend_dev_memory(dev, &free_vram, &total_vram); + size_t spare_bytes = static_cast(MAX_VRAM_BYTES_PER_GIB * spare_vram); - if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) { - LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget", - free_vram / MAX_VRAM_BYTES_PER_GIB); + if (free_vram <= spare_bytes) { + LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget", + free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram); return 0; } - const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES; - LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB", + const size_t max_vram_bytes = free_vram - spare_bytes; + LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB", free_vram / MAX_VRAM_BYTES_PER_GIB, total_vram / MAX_VRAM_BYTES_PER_GIB, + spare_vram, max_vram_bytes / MAX_VRAM_BYTES_PER_GIB); return max_vram_bytes; } float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) { - if (max_vram != -1.f) { + if (max_vram >= 0.f) { return max_vram; } - return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend)); + return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend)); } static Segment make_segment_seed(const Plan& plan, diff --git a/otherarch/sdcpp/llm.hpp b/otherarch/sdcpp/llm.hpp index ab673b208..cec8b1dcc 100644 --- a/otherarch/sdcpp/llm.hpp +++ b/otherarch/sdcpp/llm.hpp @@ -1403,7 +1403,8 @@ namespace LLM { out_layers, return_all_hidden_states); }; - return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + input_ids.dim() + 1); } int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { diff --git a/otherarch/sdcpp/ltx_audio_vae.h b/otherarch/sdcpp/ltx_audio_vae.h index d1d765d75..aad8e0f87 100644 --- a/otherarch/sdcpp/ltx_audio_vae.h +++ b/otherarch/sdcpp/ltx_audio_vae.h @@ -349,42 +349,66 @@ namespace LTXV { return type == GGML_TYPE_BF16 ? GGML_TYPE_F16 : type; } - static ggml_tensor* repeat_1d_value(ggml_context* ctx, ggml_tensor* x, int64_t count) { - GGML_ASSERT(x->ne[0] == 1); - ggml_tensor* target = ggml_new_tensor_4d(ctx, x->type, count, x->ne[1], x->ne[2], x->ne[3]); - return ggml_repeat(ctx, x, target); + static ggml_tensor* repeat_with_vulkan_f32_workaround(ggml_backend_t backend, + ggml_context* ctx, + ggml_tensor* x, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + if (x->type != GGML_TYPE_F32 && + (x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16) && + sd_backend_is(backend, "vulkan")) { + auto x_f32 = ggml_cast(ctx, x, GGML_TYPE_F32); + auto repeated = ggml_repeat_4d(ctx, + x_f32, + ne0, + ne1, + ne2, + ne3); + return ggml_cast(ctx, repeated, x->type); + } + return ggml_repeat_4d(ctx, x, ne0, ne1, ne2, ne3); } - static ggml_tensor* replicate_pad_1d(ggml_context* ctx, ggml_tensor* x, int64_t left, int64_t right) { + static ggml_tensor* repeat_1d_value(GGMLRunnerContext* runner_ctx, ggml_tensor* x, int64_t count) { + auto ctx = runner_ctx->ggml_ctx; + GGML_ASSERT(x->ne[0] == 1); + return repeat_with_vulkan_f32_workaround(runner_ctx->backend, ctx, x, count, x->ne[1], x->ne[2], x->ne[3]); + } + + static ggml_tensor* replicate_pad_1d(GGMLRunnerContext* runner_ctx, ggml_tensor* x, int64_t left, int64_t right) { + auto ctx = runner_ctx->ggml_ctx; if (left > 0) { auto first = ggml_ext_slice(ctx, x, 0, 0, 1); - x = ggml_concat(ctx, repeat_1d_value(ctx, first, left), x, 0); + x = ggml_concat(ctx, repeat_1d_value(runner_ctx, first, left), x, 0); } if (right > 0) { auto last = ggml_ext_slice(ctx, x, 0, x->ne[0] - 1, x->ne[0]); - x = ggml_concat(ctx, x, repeat_1d_value(ctx, last, right), 0); + x = ggml_concat(ctx, x, repeat_1d_value(runner_ctx, last, right), 0); } return x; } - static ggml_tensor* tile_depthwise_filter_1d(ggml_context* ctx, ggml_tensor* filter, int64_t channels) { + static ggml_tensor* tile_depthwise_filter_1d(GGMLRunnerContext* runner_ctx, ggml_tensor* filter, int64_t channels) { + auto ctx = runner_ctx->ggml_ctx; ggml_tensor* base = filter; if (ggml_n_dims(base) == 3) { base = ggml_reshape_4d(ctx, base, base->ne[0], 1, 1, 1); } else if (ggml_n_dims(base) == 1) { base = ggml_reshape_4d(ctx, base, base->ne[0], 1, 1, 1); } - ggml_tensor* target = ggml_new_tensor_4d(ctx, base->type, base->ne[0], 1, channels, 1); - return ggml_repeat(ctx, base, target); + return repeat_with_vulkan_f32_workaround(runner_ctx->backend, ctx, base, base->ne[0], 1, channels, 1); } - static ggml_tensor* depthwise_conv1d(ggml_context* ctx, + static ggml_tensor* depthwise_conv1d(GGMLRunnerContext* runner_ctx, ggml_tensor* x, ggml_tensor* filter, int stride, int padding) { + auto ctx = runner_ctx->ggml_ctx; GGML_ASSERT(x->ne[3] == 1); - auto tiled = tile_depthwise_filter_1d(ctx, filter, x->ne[1]); + auto tiled = tile_depthwise_filter_1d(runner_ctx, filter, x->ne[1]); auto out = ggml_conv_1d_dw(ctx, tiled, x, stride, padding, 1); return ggml_reshape_4d(ctx, out, out->ne[0], out->ne[1], 1, 1); } @@ -654,7 +678,7 @@ namespace LTXV { int up_pad_left = up_pad * up_ratio + (up_kernel_size - up_ratio) / 2; int up_pad_right = up_pad * up_ratio + (up_kernel_size - up_ratio + 1) / 2; - x = replicate_pad_1d(ctx->ggml_ctx, x, up_pad, up_pad); + x = replicate_pad_1d(ctx, x, up_pad, up_pad); x = depthwise_conv_transpose1d(ctx->ggml_ctx, x, up_filter, up_ratio); x = ggml_ext_slice(ctx->ggml_ctx, x, 0, up_pad_left, x->ne[0] - up_pad_right); @@ -662,8 +686,8 @@ namespace LTXV { int down_pad_left = down_kernel_size / 2 - (down_kernel_size % 2 == 0 ? 1 : 0); int down_pad_right = down_kernel_size / 2; - x = replicate_pad_1d(ctx->ggml_ctx, x, down_pad_left, down_pad_right); - x = depthwise_conv1d(ctx->ggml_ctx, x, down_filter, down_ratio, 0); + x = replicate_pad_1d(ctx, x, down_pad_left, down_pad_right); + x = depthwise_conv1d(ctx, x, down_filter, down_ratio, 0); return x; } }; diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index fc60d672f..fa90942e1 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -531,8 +531,9 @@ public: LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path); if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) { LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path); + } else { + use_tae = true; } - use_tae = true; } if (strlen(SAFE_STR(sd_ctx_params->embeddings_connectors_path)) > 0) { diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h index dc1fb093f..3d5b3fd18 100644 --- a/otherarch/sdcpp/stable-diffusion.h +++ b/otherarch/sdcpp/stable-diffusion.h @@ -126,7 +126,7 @@ enum sd_type_t { // SD_TYPE_IQ4_NL_8_8 = 38, SD_TYPE_MXFP4 = 39, // MXFP4 (1 block) SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) - SD_TYPE_Q1_0 = 41, + SD_TYPE_Q1_0 = 41, SD_TYPE_COUNT = 42, };