sd 3.5 medium

2026-05-22 03:10:03 +00:00 · 2024-11-03 23:27:06 +08:00 · 2024-11-03 23:27:06 +08:00 · 5233e8ed1d
commit 5233e8ed1d
parent f32a874966
5 changed files with 204 additions and 49 deletions
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -31,7 +31,8 @@ const char* model_version_to_str[] = {
    "SD3 2B",
    "Flux Dev",
    "Flux Schnell",
-    "SD3.5 8B"};
+    "SD3.5 8B",
+    "SD3.5 2B"};

 const char* sampling_methods_str[] = {
    "Euler A",
@@ -294,7 +295,7 @@ public:
                    "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
                    "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
            }
-        } else if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
+        } else if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
            scale_factor = 1.5305f;
        } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
            scale_factor = 0.3611;
@@ -317,7 +318,7 @@ public:
        } else {
            clip_backend   = backend;
            bool use_t5xxl = false;
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
                use_t5xxl = true;
            }
            if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != GGML_TYPE_F32) {
@@ -328,7 +329,7 @@ public:
                LOG_INFO("CLIP: Using CPU backend");
                clip_backend = ggml_backend_cpu_init();
            }
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
+            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype);
                diffusion_model  = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
            } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
@@ -526,7 +527,7 @@ public:
            is_using_v_parameterization = true;
        }

-        if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
+        if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
            LOG_INFO("running in FLOW mode");
            denoiser = std::make_shared<DiscreteFlowDenoiser>();
        } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
@@ -986,7 +987,7 @@ public:
        if (use_tiny_autoencoder) {
            C = 4;
        } else {
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
+            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
                C = 32;
            } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
                C = 32;
@@ -1325,7 +1326,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
    // Sample
    std::vector<struct ggml_tensor*> final_latents;  // collect latents to decode
    int C = 4;
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
+    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
        C = 16;
    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
        C = 16;
@@ -1438,7 +1439,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
+    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
        params.mem_size *= 3;
    }
    if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
@@ -1464,7 +1465,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);

    int C = 4;
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
+    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
        C = 16;
    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
        C = 16;
@@ -1472,7 +1473,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
    int W                    = width / 8;
    int H                    = height / 8;
    ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
+    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
        ggml_set_f32(init_latent, 0.0609f);
    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
        ggml_set_f32(init_latent, 0.1159f);
@@ -1533,7 +1534,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
+    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
        params.mem_size *= 2;
    }
    if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
@@ -1571,7 +1572,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
    } else {
        init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
    }
-    print_ggml_tensor(init_latent, true);
+    // print_ggml_tensor(init_latent, true);
    size_t t1 = ggml_time_ms();
    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);