From fab2ff0687ffb6e2f57e256900a55c616a52cf58 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 20 Oct 2025 10:45:34 +0800
Subject: [PATCH] sync sd.cpp to e370258

---
 otherarch/sdcpp/main.cpp             | 197 +++++++++++++++----------
 otherarch/sdcpp/stable-diffusion.cpp | 210 ++++++++++++++++++---------
 otherarch/sdcpp/stable-diffusion.h   |  15 ++
 3 files changed, 276 insertions(+), 146 deletions(-)

diff --git a/otherarch/sdcpp/main.cpp b/otherarch/sdcpp/main.cpp
index ee16763a3..538e1e64f 100644
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@@ -41,13 +41,15 @@ const char* modes_str[] = {
     "img_gen",
     "vid_gen",
     "convert",
+    "upscale",
 };
-#define SD_ALL_MODES_STR "img_gen, vid_gen, convert"
+#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"
 
 enum SDMode {
     IMG_GEN,
     VID_GEN,
     CONVERT,
+    UPSCALE,
     MODE_COUNT
 };
 
@@ -82,6 +84,7 @@ struct SDParams {
 
     std::string prompt;
     std::string negative_prompt;
+
     int clip_skip   = -1;  // <= 0 represents unspecified
     int width       = 512;
     int height      = 512;
@@ -125,6 +128,8 @@ struct SDParams {
     int chroma_t5_mask_pad   = 1;
     float flow_shift         = INFINITY;
 
+    prediction_t prediction = DEFAULT_PRED;
+
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
 
     SDParams() {
@@ -186,6 +191,7 @@ void print_params(SDParams params) {
     printf("    sample_params:                     %s\n", SAFE_STR(sample_params_str));
     printf("    high_noise_sample_params:          %s\n", SAFE_STR(high_noise_sample_params_str));
     printf("    moe_boundary:                      %.3f\n", params.moe_boundary);
+    printf("    prediction:                        %s\n", sd_prediction_name(params.prediction));
     printf("    flow_shift:                        %.2f\n", params.flow_shift);
     printf("    strength(img2img):                 %.2f\n", params.strength);
     printf("    rng:                               %s\n", sd_rng_type_name(params.rng_type));
@@ -208,7 +214,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("\n");
     printf("arguments:\n");
     printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
+    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen\n");
     printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
     printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf("  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
@@ -225,7 +231,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
-    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
+    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
     printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n");
     printf("                                     If not specified, the default is the type of the weight file\n");
@@ -279,6 +285,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
     printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
     printf("  -b, --batch-count COUNT            number of images to generate\n");
+    printf("  --prediction {eps, v, edm_v, sd3_flow, flux_flow}        Prediction type override.\n");
     printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
     printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
     printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
@@ -649,6 +656,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_prediction_arg = [&](int argc, const char** argv, int index) {  // handler for "--prediction <name>"
+        if (++index >= argc) {  // option was given without a value
+            return -1;
+        }
+        const char* arg   = argv[index];
+        params.prediction = str_to_prediction(arg);
+        if (params.prediction == PREDICTION_COUNT) {  // PREDICTION_COUNT is the "no such name" result
+            fprintf(stderr, "error: invalid prediction type %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;  // presumably the number of extra argv entries consumed - confirm against the option parser
+    };
+
     auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
@@ -805,6 +826,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--rng", "", on_rng_arg},
         {"-s", "--seed", "", on_seed_arg},
         {"", "--sampling-method", "", on_sample_method_arg},
+        {"", "--prediction", "", on_prediction_arg},
         {"", "--scheduler", "", on_schedule_arg},
         {"", "--skip-layers", "", on_skip_layers_arg},
         {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
@@ -825,13 +847,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         params.n_threads = sd_get_num_physical_cores();
     }
 
-    if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) {
+    if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: prompt\n");
         print_usage(argc, argv);
         exit(1);
     }
 
-    if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
+    if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
         print_usage(argc, argv);
         exit(1);
@@ -891,6 +913,17 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
+    if (params.mode == UPSCALE) {
+        if (params.esrgan_path.length() == 0) {
+            fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n");
+            exit(1);
+        }
+        if (params.init_image_path.length() == 0) {
+            fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
+            exit(1);
+        }
+    }
+
     if (params.seed < 0) {
         srand((int)time(NULL));
         params.seed = rand();
@@ -901,14 +934,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.output_path = "output.gguf";
         }
     }
-
-    if (!isfinite(params.sample_params.guidance.img_cfg)) {
-        params.sample_params.guidance.img_cfg = params.sample_params.guidance.txt_cfg;
-    }
-
-    if (!isfinite(params.high_noise_sample_params.guidance.img_cfg)) {
-        params.high_noise_sample_params.guidance.img_cfg = params.high_noise_sample_params.guidance.txt_cfg;
-    }
 }
 
 static std::string sd_basename(const std::string& path) {
@@ -1349,6 +1374,7 @@ int main(int argc, const char* argv[]) {
         params.n_threads,
         params.wtype,
         params.rng_type,
+        params.prediction,
         params.offload_params_to_cpu,
         params.clip_on_cpu,
         params.control_net_cpu,
@@ -1362,76 +1388,92 @@ int main(int argc, const char* argv[]) {
         params.flow_shift,
     };
 
-    sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
+    sd_image_t* results = nullptr;
+    int num_results     = 0;
 
-    if (sd_ctx == NULL) {
-        printf("new_sd_ctx_t failed\n");
-        release_all_resources();
-        return 1;
-    }
+    if (params.mode == UPSCALE) {
+        num_results = 1;
+        results     = (sd_image_t*)calloc(num_results, sizeof(sd_image_t));
+        if (results == NULL) {
+            printf("failed to allocate results array\n");
+            release_all_resources();
+            return 1;
+        }
 
-    if (params.sample_params.sample_method == SAMPLE_METHOD_DEFAULT) {
-        params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx);
-    }
+        results[0]      = init_image;
+        init_image.data = NULL;
+    } else {
+        sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
 
-    sd_image_t* results;
-    int num_results = 1;
-    if (params.mode == IMG_GEN) {
-        sd_img_gen_params_t img_gen_params = {
-            params.prompt.c_str(),
-            params.negative_prompt.c_str(),
-            params.clip_skip,
-            init_image,
-            ref_images.data(),
-            (int)ref_images.size(),
-            params.increase_ref_index,
-            mask_image,
-            params.width,
-            params.height,
-            params.sample_params,
-            params.strength,
-            params.seed,
-            params.batch_count,
-            control_image,
-            params.control_strength,
-            {
-                pmid_images.data(),
-                (int)pmid_images.size(),
-                params.pm_id_embed_path.c_str(),
-                params.pm_style_strength,
-            },  // pm_params
-            params.vae_tiling_params,
-        };
+        if (sd_ctx == NULL) {
+            printf("new_sd_ctx_t failed\n");
+            release_all_resources();
+            return 1;
+        }
 
-        results     = generate_image(sd_ctx, &img_gen_params);
-        num_results = params.batch_count;
-    } else if (params.mode == VID_GEN) {
-        sd_vid_gen_params_t vid_gen_params = {
-            params.prompt.c_str(),
-            params.negative_prompt.c_str(),
-            params.clip_skip,
-            init_image,
-            end_image,
-            control_frames.data(),
-            (int)control_frames.size(),
-            params.width,
-            params.height,
-            params.sample_params,
-            params.high_noise_sample_params,
-            params.moe_boundary,
-            params.strength,
-            params.seed,
-            params.video_frames,
-            params.vace_strength,
-        };
+        if (params.sample_params.sample_method == SAMPLE_METHOD_DEFAULT) {
+            params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx);
+        }
 
-        results = generate_video(sd_ctx, &vid_gen_params, &num_results);
-    }
+        if (params.mode == IMG_GEN) {
+            sd_img_gen_params_t img_gen_params = {
+                params.prompt.c_str(),
+                params.negative_prompt.c_str(),
+                params.clip_skip,
+                init_image,
+                ref_images.data(),
+                (int)ref_images.size(),
+                params.increase_ref_index,
+                mask_image,
+                params.width,
+                params.height,
+                params.sample_params,
+                params.strength,
+                params.seed,
+                params.batch_count,
+                control_image,
+                params.control_strength,
+                {
+                    pmid_images.data(),
+                    (int)pmid_images.size(),
+                    params.pm_id_embed_path.c_str(),
+                    params.pm_style_strength,
+                },  // pm_params
+                params.vae_tiling_params,
+            };
+
+            results     = generate_image(sd_ctx, &img_gen_params);
+            num_results = params.batch_count;
+        } else if (params.mode == VID_GEN) {
+            sd_vid_gen_params_t vid_gen_params = {
+                params.prompt.c_str(),
+                params.negative_prompt.c_str(),
+                params.clip_skip,
+                init_image,
+                end_image,
+                control_frames.data(),
+                (int)control_frames.size(),
+                params.width,
+                params.height,
+                params.sample_params,
+                params.high_noise_sample_params,
+                params.moe_boundary,
+                params.strength,
+                params.seed,
+                params.video_frames,
+                params.vace_strength,
+            };
+
+            results = generate_video(sd_ctx, &vid_gen_params, &num_results);
+        }
+
+        if (results == NULL) {
+            printf("generate failed\n");
+            free_sd_ctx(sd_ctx);
+            return 1;
+        }
 
-    if (results == NULL) {
-        printf("generate failed\n");
         free_sd_ctx(sd_ctx);
-        return 1;
     }
 
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
@@ -1444,7 +1486,7 @@ int main(int argc, const char* argv[]) {
         if (upscaler_ctx == NULL) {
             printf("new_upscaler_ctx failed\n");
         } else {
-            for (int i = 0; i < params.batch_count; i++) {
+            for (int i = 0; i < num_results; i++) {
                 if (results[i].data == NULL) {
                     continue;
                 }
@@ -1530,7 +1572,6 @@ int main(int argc, const char* argv[]) {
         results[i].data = NULL;
     }
     free(results);
-    free_sd_ctx(sd_ctx);
 
     release_all_resources();
 
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index db788c99b..52b8ed28c 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -836,64 +836,102 @@ public:
                 ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
         }
 
-        // check is_using_v_parameterization_for_sd2
-        if (sd_version_is_sd2(version)) {
-            if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
-                is_using_v_parameterization = true;
-            }
-        } else if (sd_version_is_sdxl(version)) {
-            if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
-                // CosXL models
-                // TODO: get sigma_min and sigma_max values from file
-                is_using_edm_v_parameterization = true;
-            }
-            if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
-                is_using_v_parameterization = true;
-            }
-        } else if (version == VERSION_SVD) {
-            // TODO: V_PREDICTION_EDM
-            is_using_v_parameterization = true;
-        }
-
-        if (sd_version_is_sd3(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_flux(version)) {
-            LOG_INFO("running in Flux FLOW mode");
-            float shift = 1.0f;  // TODO: validate
-            for (auto pair : model_loader.tensor_storages_types) {
-                if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
-                    shift = 1.15f;
+        if (sd_ctx_params->prediction != DEFAULT_PRED) {
+            switch (sd_ctx_params->prediction) {
+                case EPS_PRED:
+                    LOG_INFO("running in eps-prediction mode");
+                    break;
+                case V_PRED:
+                    LOG_INFO("running in v-prediction mode");
+                    denoiser = std::make_shared<CompVisVDenoiser>();
+                    break;
+                case EDM_V_PRED:
+                    LOG_INFO("running in v-prediction EDM mode");
+                    denoiser = std::make_shared<EDMVDenoiser>();
+                    break;
+                case SD3_FLOW_PRED: {
+                    LOG_INFO("running in FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
                     break;
                 }
+                case FLUX_FLOW_PRED: {
+                    LOG_INFO("running in Flux FLOW mode");
+                    float shift = sd_ctx_params->flow_shift;
+                    if (shift == INFINITY) {
+                        shift = 3.0;
+                    }
+                    denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+                    break;
+                }
+                default: {
+                    LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction);
+                    return false;
+                }
             }
-            denoiser = std::make_shared<FluxFlowDenoiser>(shift);
-        } else if (sd_version_is_wan(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 5.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (sd_version_is_qwen_image(version)) {
-            LOG_INFO("running in FLOW mode");
-            float shift = sd_ctx_params->flow_shift;
-            if (shift == INFINITY) {
-                shift = 3.0;
-            }
-            denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
-        } else if (is_using_v_parameterization) {
-            LOG_INFO("running in v-prediction mode");
-            denoiser = std::make_shared<CompVisVDenoiser>();
-        } else if (is_using_edm_v_parameterization) {
-            LOG_INFO("running in v-prediction EDM mode");
-            denoiser = std::make_shared<EDMVDenoiser>();
         } else {
-            LOG_INFO("running in eps-prediction mode");
+            if (sd_version_is_sd2(version)) {
+                // check is_using_v_parameterization_for_sd2
+                if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (sd_version_is_sdxl(version)) {
+                if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
+                    // CosXL models
+                    // TODO: get sigma_min and sigma_max values from file
+                    is_using_edm_v_parameterization = true;
+                }
+                if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
+                    is_using_v_parameterization = true;
+                }
+            } else if (version == VERSION_SVD) {
+                // TODO: V_PREDICTION_EDM
+                is_using_v_parameterization = true;
+            }
+
+            if (sd_version_is_sd3(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_flux(version)) {
+                LOG_INFO("running in Flux FLOW mode");
+                float shift = 1.0f;  // TODO: validate
+                for (auto pair : model_loader.tensor_storages_types) {
+                    if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+                        shift = 1.15f;
+                        break;
+                    }
+                }
+                denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+            } else if (sd_version_is_wan(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 5.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (sd_version_is_qwen_image(version)) {
+                LOG_INFO("running in FLOW mode");
+                float shift = sd_ctx_params->flow_shift;
+                if (shift == INFINITY) {
+                    shift = 3.0;
+                }
+                denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
+            } else if (is_using_v_parameterization) {
+                LOG_INFO("running in v-prediction mode");
+                denoiser = std::make_shared<CompVisVDenoiser>();
+            } else if (is_using_edm_v_parameterization) {
+                LOG_INFO("running in v-prediction EDM mode");
+                denoiser = std::make_shared<EDMVDenoiser>();
+            } else {
+                LOG_INFO("running in eps-prediction mode");
+            }
         }
 
         auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
@@ -1281,7 +1319,7 @@ public:
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         float cfg_scale     = guidance.txt_cfg;
-        float img_cfg_scale = guidance.img_cfg;
+        float img_cfg_scale = isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
         float slg_scale     = guidance.slg.scale;
 
         if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
@@ -1325,11 +1363,12 @@ public:
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
+        int64_t t0 = ggml_time_us();
+
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
-            if (step == 1) {
+            if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
             }
-            int64_t t0 = ggml_time_us();
 
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             GGML_ASSERT(scaling.size() == 3);
@@ -1483,8 +1522,9 @@ public:
             }
 
             int64_t t1 = ggml_time_us();
-            if (step > 0) {
-                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
+            if (step > 0 || step == -(int)steps) {
+                int showstep = std::abs(step);
+                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
             if (denoise_mask != nullptr) {
@@ -1625,19 +1665,19 @@ public:
         if (vae_tiling_params.enabled && !encode_video) {
             // TODO wan2.2 vae support?
             int C = sd_version_is_dit(version) ? 16 : 4;
-            int NE2, NE3;
+            int ne2;
+            int ne3;
             if (sd_version_is_qwen_image(version)) {
-                NE2 = x->ne[3];
-                NE3 = C;
-            }
-            else {
+                ne2 = 1;
+                ne3 = C * x->ne[3];
+            } else {
                 if (!use_tiny_autoencoder) {
                     C *= 2;
                 }
-                NE2 = C;
-                NE3 = x->ne[3];
+                ne2 = C;
+                ne3 = x->ne[3];
             }
-            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, NE2, NE3);
+            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
         }
 
         if (sd_version_is_qwen_image(version)) {
@@ -1911,6 +1951,31 @@ enum scheduler_t str_to_schedule(const char* str) {
     return SCHEDULE_COUNT;
 }
 
+const char* prediction_to_str[] = {  // names indexed by enum prediction_t; order must follow the enum
+    "default",    // DEFAULT_PRED
+    "eps",        // EPS_PRED
+    "v",          // V_PRED
+    "edm_v",      // EDM_V_PRED
+    "sd3_flow",   // SD3_FLOW_PRED
+    "flux_flow",  // FLUX_FLOW_PRED
+};
+
+const char* sd_prediction_name(enum prediction_t prediction) {  // printable name for a prediction_t value
+    if (prediction < PREDICTION_COUNT) {  // only values below the sentinel have a table entry
+        return prediction_to_str[prediction];
+    }
+    return NONE_STR;  // fallback for PREDICTION_COUNT and anything above it
+}
+
+enum prediction_t str_to_prediction(const char* str) {  // exact-match lookup of a name in prediction_to_str
+    for (int i = 0; i < PREDICTION_COUNT; i++) {  // starts at 0, so "default" is accepted as well
+        if (!strcmp(str, prediction_to_str[i])) {  // strcmp is case-sensitive; str is not checked for null
+            return (enum prediction_t)i;
+        }
+    }
+    return PREDICTION_COUNT;  // sentinel: no name matched
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     *sd_ctx_params                         = {};
     sd_ctx_params->vae_decode_only         = true;
@@ -1918,6 +1983,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->n_threads               = sd_get_num_physical_cores();
     sd_ctx_params->wtype                   = SD_TYPE_COUNT;
     sd_ctx_params->rng_type                = CUDA_RNG;
+    sd_ctx_params->prediction              = DEFAULT_PRED;
     sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->keep_clip_on_cpu        = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
@@ -1957,6 +2023,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "n_threads: %d\n"
              "wtype: %s\n"
              "rng_type: %s\n"
+             "prediction: %s\n"
              "offload_params_to_cpu: %s\n"
              "keep_clip_on_cpu: %s\n"
              "keep_control_net_on_cpu: %s\n"
@@ -1985,6 +2052,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              sd_ctx_params->n_threads,
              sd_type_name(sd_ctx_params->wtype),
              sd_rng_type_name(sd_ctx_params->rng_type),
+             sd_prediction_name(sd_ctx_params->prediction),
              BOOL_STR(sd_ctx_params->offload_params_to_cpu),
              BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
              BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
@@ -2031,7 +2099,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
              "eta: %.2f, "
              "shifted_timestep: %d)",
              sample_params->guidance.txt_cfg,
-             sample_params->guidance.img_cfg,
+             isfinite(sample_params->guidance.img_cfg)
+                 ? sample_params->guidance.img_cfg
+                 : sample_params->guidance.txt_cfg,
              sample_params->guidance.distilled_guidance,
              sample_params->guidance.slg.layer_count,
              sample_params->guidance.slg.layer_start,
@@ -2193,6 +2263,10 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         seed = rand();
     }
 
+    if (!isfinite(guidance.img_cfg)) {
+        guidance.img_cfg = guidance.txt_cfg;
+    }
+
     // for (auto v : sigmas) {
     //     std::cout << v << " ";
     // }
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index bc5d9a0d2..7c638469e 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -64,6 +64,16 @@ enum scheduler_t {
     SCHEDULE_COUNT
 };
 
+enum prediction_t {  // prediction-type selector stored in sd_ctx_params_t::prediction
+    DEFAULT_PRED,     // no override: the loader picks the denoiser from the model version / tensors
+    EPS_PRED,         // eps-prediction; no replacement denoiser is constructed for this value
+    V_PRED,           // v-prediction (CompVisVDenoiser)
+    EDM_V_PRED,       // v-prediction EDM (EDMVDenoiser)
+    SD3_FLOW_PRED,    // flow (DiscreteFlowDenoiser); shift defaults to 3.0 when flow_shift is INFINITY
+    FLUX_FLOW_PRED,   // Flux flow (FluxFlowDenoiser); shift defaults to 3.0 when flow_shift is INFINITY
+    PREDICTION_COUNT  // number of values above; also returned by str_to_prediction() for an unknown name
+};
+
 // same as enum ggml_type
 enum sd_type_t {
     SD_TYPE_F32  = 0,
@@ -146,6 +156,7 @@ typedef struct {
     int n_threads;
     enum sd_type_t wtype;
     enum rng_type_t rng_type;
+    enum prediction_t prediction;
     bool offload_params_to_cpu;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
@@ -255,6 +266,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
 SD_API enum sample_method_t str_to_sample_method(const char* str);
 SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
 SD_API enum scheduler_t str_to_schedule(const char* str);
+SD_API const char* sd_prediction_name(enum prediction_t prediction);
+SD_API enum prediction_t str_to_prediction(const char* str);
 
 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
@@ -285,6 +298,8 @@ SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
                           sd_image_t input_image,
                           uint32_t upscale_factor);
 
+SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx);
+
 SD_API bool convert(const char* input_path,
                     const char* vae_path,
                     const char* output_path,