sd: sync to master-391-5865b5e (#1872)

2026-05-19 16:31:59 +00:00 · 2025-12-04 05:29:38 -03:00 · 2025-12-04 05:29:38 -03:00 · 510508e7da
commit 510508e7da
parent 03cec02a3d
5 changed files with 1544 additions and 1395 deletions
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -106,8 +106,12 @@ const char* unused_tensors[] = {
    "model_ema.diffusion_model",
    "embedding_manager",
    "denoiser.sigmas",
-    "edm_vpred.sigma_max",
    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
+    "ztsnr",  // Found in some SDXL vpred models
+    "edm_vpred.sigma_min", // Found in CosXL
+    // TODO: find another way to avoid the "unknown tensor" for these two
+    // "edm_vpred.sigma_max", // Used to detect CosXL
+    // "v_pred", // Used to detect SDXL vpred models
    "text_encoders.llm.output.weight",
    "text_encoders.llm.lm_head.",
    "first_stage_model.bn.",
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -2267,12 +2267,12 @@ public:
    }

    ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
-        int64_t t0          = ggml_time_ms();
-        ggml_tensor* result = nullptr;
+        int64_t t0                 = ggml_time_ms();
+        ggml_tensor* result        = nullptr;
        const int vae_scale_factor = get_vae_scale_factor();
        int W                      = x->ne[0] / vae_scale_factor;
        int H                      = x->ne[1] / vae_scale_factor;
-        int C               = get_latent_channel();
+        int C                      = get_latent_channel();
        if (vae_tiling_params.enabled && !encode_video) {
            // TODO wan2.2 vae support?
            int ne2;
@@ -2397,8 +2397,8 @@ public:
        const int vae_scale_factor = get_vae_scale_factor();
        int64_t W                  = x->ne[0] * vae_scale_factor;
        int64_t H                  = x->ne[1] * vae_scale_factor;
-        int64_t C           = 3;
-        ggml_tensor* result = nullptr;
+        int64_t C                  = 3;
+        ggml_tensor* result        = nullptr;
        if (decode_video) {
            int T = x->ne[2];
            if (sd_version_is_wan(version)) {
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@@ -411,19 +411,19 @@ const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
    ss << "System Info: \n";
-    ss << "    SSE3 = " << ggml_cpu_has_sse3() << std::endl;
-    ss << "    AVX = " << ggml_cpu_has_avx() << std::endl;
-    ss << "    AVX2 = " << ggml_cpu_has_avx2() << std::endl;
-    ss << "    AVX512 = " << ggml_cpu_has_avx512() << std::endl;
-    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
-    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
-    ss << "    FMA = " << ggml_cpu_has_fma() << std::endl;
-    ss << "    NEON = " << ggml_cpu_has_neon() << std::endl;
-    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
-    ss << "    F16C = " << ggml_cpu_has_f16c() << std::endl;
-    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
-    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
-    ss << "    VSX = " << ggml_cpu_has_vsx() << std::endl;
+    ss << "    SSE3 = " << ggml_cpu_has_sse3() << " | ";
+    ss << "    AVX = " << ggml_cpu_has_avx() << " | ";
+    ss << "    AVX2 = " << ggml_cpu_has_avx2() << " | ";
+    ss << "    AVX512 = " << ggml_cpu_has_avx512() << " | ";
+    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
+    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
+    ss << "    FMA = " << ggml_cpu_has_fma() << " | ";
+    ss << "    NEON = " << ggml_cpu_has_neon() << " | ";
+    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
+    ss << "    F16C = " << ggml_cpu_has_f16c() << " | ";
+    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
+    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
+    ss << "    VSX = " << ggml_cpu_has_vsx() << " | ";
    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
    return buffer;
 }
--- a/otherarch/sdcpp/z_image.hpp
+++ b/otherarch/sdcpp/z_image.hpp
@@ -30,7 +30,12 @@ namespace ZImage {
        JointAttention(int64_t hidden_size, int64_t head_dim, int64_t num_heads, int64_t num_kv_heads, bool qk_norm)
            : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
            blocks["qkv"] = std::make_shared<Linear>(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
-            blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false);
+            float scale = 1.f;
+#if GGML_USE_HIP
+            // Prevent NaN issues with certain ROCm setups
+            scale = 1.f / 16.f;
+#endif
+            blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, scale);
            if (qk_norm) {
                blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
                blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim);
@@ -93,7 +98,7 @@ namespace ZImage {
 #endif
            // The purpose of the scale here is to prevent NaN issues in certain situations.
            // For example, when using CUDA but the weights are k-quants.
-            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, 1.f / 128.f);
+            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
            blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
        }

@@ -667,4 +672,4 @@ namespace ZImage {

 }  // namespace ZImage

-#endif  // __Z_IMAGE_HPP__
+#endif  // __Z_IMAGE_HPP__