sd: sync to master-637-ef92a00

2026-05-22 19:47:49 +00:00 · 2026-05-20 22:42:01 -03:00 · 2026-05-20 22:42:01 -03:00 · f27795cef0
commit f27795cef0
parent 627e317cd7
9 changed files with 1070 additions and 142 deletions
--- a/2
+++ b/2
@ -695,7 +695,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h
 budget.o: common/reasoning-budget.cpp common/reasoning-budget.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp
+SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_latent_upscaler.hpp ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp

 SDCPP_MAIN_BASENAMES := common/common.cpp common/common.h common/log.cpp common/log.h common/media_io.cpp common/media_io.cpp common/media_io.h common/resource_owners.hpp convert.cpp image_metadata.cpp main.cpp tokenizers/vocab/clip_merges.hpp tokenizers/vocab/gemma_merges.hpp tokenizers/vocab/gemma_vocab.hpp tokenizers/vocab/mistral_merges.hpp tokenizers/vocab/mistral_vocab.hpp tokenizers/vocab/qwen_merges.hpp tokenizers/vocab/t5.hpp tokenizers/vocab/umt5.hpp tokenizers/vocab/vocab.cpp version.cpp

--- a/otherarch/sdcpp/common/common.cpp
+++ b/otherarch/sdcpp/common/common.cpp
@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() {
        return 1;
    };

-    auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
-        if (++index >= argc) {
+    auto parse_sigmas_arg = [&](const char* value, std::vector<float>* target, const char* option_name) {
+        if (target == nullptr || value == nullptr) {
            return -1;
        }
-        std::string sigmas_str = argv[index];
+        std::string sigmas_str = value;
        if (!sigmas_str.empty() && sigmas_str.front() == '[') {
            sigmas_str.erase(0, 1);
        }
@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() {
            sigmas_str.pop_back();
        }

+        size_t before = target->size();
        std::stringstream ss(sigmas_str);
        std::string item;
        while (std::getline(ss, item, ',')) {
@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() {
            item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);
            if (!item.empty()) {
                try {
-                    custom_sigmas.push_back(std::stof(item));
+                    target->push_back(std::stof(item));
                } catch (const std::invalid_argument&) {
-                    LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
+                    LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name);
                    return -1;
                } catch (const std::out_of_range&) {
-                    LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
+                    LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name);
                    return -1;
                }
            }
        }

-        if (custom_sigmas.empty() && !sigmas_str.empty()) {
-            LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]);
+        if (target->size() == before && !sigmas_str.empty()) {
+            LOG_ERROR("error: could not parse any sigma values from '%s'", value);
            return -1;
        }
        return 1;
    };

+    auto on_sigmas_arg = [&](int argc, const char** argv, int index) {  // handler for --sigmas: forwards the following argv entry to parse_sigmas_arg
+        if (++index >= argc) {  // advance from the option to its value; nothing follows the option
+            return -1;
+        }
+        return parse_sigmas_arg(argv[index], &custom_sigmas, "--sigmas");  // parsed floats are appended to custom_sigmas; result is passed through unchanged
+    };
+
+    auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) {  // handler for --hires-sigmas: same flow as on_sigmas_arg with a different target
+        if (++index >= argc) {  // advance from the option to its value; nothing follows the option
+            return -1;
+        }
+        return parse_sigmas_arg(argv[index], &hires_custom_sigmas, "--hires-sigmas");  // parsed floats are appended to hires_custom_sigmas
+    };
+
    auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
            return -1;
@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() {
         "--sigmas",
         "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
         on_sigmas_arg},
+        {"",
+         "--hires-sigmas",
+         "custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").",
+         on_hires_sigmas_arg},
        {"",
         "--skip-layers",
         "layers to skip for SLG steps (default: [7,8,9])",
@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name,
        LOG_ERROR("%s directory is empty", label);
        return false;
    }
+    auto ends_with_valid_ext = [&]() {  // true when model_name ends, ignoring ASCII case, with any entry of valid_ext
+        for (const auto& ext : valid_ext) {  // NOTE(review): valid_ext is declared outside this view; presumably entries carry their leading '.' - confirm
+            if (model_name.size() < ext.size()) {  // name too short to end with this extension
+                continue;
+            }
+            auto suffix = model_name.substr(model_name.size() - ext.size());  // trailing ext.size() characters of the name
+            std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](unsigned char c) {  // unsigned char keeps std::tolower defined for bytes >= 0x80
+                return static_cast<char>(std::tolower(c));
+            });
+            std::string lower_ext = ext;  // lowercase a copy so valid_ext itself is left untouched
+            std::transform(lower_ext.begin(), lower_ext.end(), lower_ext.begin(), [](unsigned char c) {
+                return static_cast<char>(std::tolower(c));
+            });
+            if (suffix == lower_ext) {
+                return true;
+            }
+        }
+        return false;  // no extension matched
+    };
+
    if (model_name.empty() ||
        model_name.find('/') != std::string::npos ||
        model_name.find('\\') != std::string::npos ||
        fs::path(model_name).has_root_path() ||
-        fs::path(model_name).has_extension()) {
+        ends_with_valid_ext()) {
        LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
        return false;
    }
@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str(
        if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
            hires_denoising_strength = hires_json["denoising_strength"];
        }
+        if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) {
+            hires_custom_sigmas = hires_json["custom_sigmas"].get<std::vector<float>>();
+        }
        if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
            hires_upscale_tile_size = hires_json["upscale_tile_size"];
        }
@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) {
            LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
            return false;
        }
+        if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
+            LOG_ERROR("error: hires custom sigmas must contain at least two values");
+            return false;
+        }
        if (hires_upscale_tile_size < 1) {
            LOG_ERROR("error: hires upscale tile size must be positive");
            return false;
@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
    params.vae_tiling_params     = vae_tiling_params;
    params.cache                 = cache_params;

-    params.hires.enabled            = hires_enabled;
-    params.hires.upscaler           = resolved_hires_upscaler;
-    params.hires.model_path         = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
-    params.hires.scale              = hires_scale;
-    params.hires.target_width       = hires_width;
-    params.hires.target_height      = hires_height;
-    params.hires.steps              = hires_steps;
-    params.hires.denoising_strength = hires_denoising_strength;
-    params.hires.upscale_tile_size  = hires_upscale_tile_size;
+    params.hires.enabled             = hires_enabled;
+    params.hires.upscaler            = resolved_hires_upscaler;
+    params.hires.model_path          = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
+    params.hires.scale               = hires_scale;
+    params.hires.target_width        = hires_width;
+    params.hires.target_height       = hires_height;
+    params.hires.steps               = hires_steps;
+    params.hires.denoising_strength  = hires_denoising_strength;
+    params.hires.upscale_tile_size   = hires_upscale_tile_size;
+    params.hires.custom_sigmas       = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
+    params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
    return params;
 }

@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
    high_noise_sample_params.extra_sample_args        = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
    cache_params.scm_mask                             = scm_mask.empty() ? nullptr : scm_mask.c_str();

-    params.loras                    = lora_vec.empty() ? nullptr : lora_vec.data();
-    params.lora_count               = static_cast<uint32_t>(lora_vec.size());
-    params.prompt                   = prompt.c_str();
-    params.negative_prompt          = negative_prompt.c_str();
-    params.clip_skip                = clip_skip;
-    params.init_image               = init_image.get();
-    params.end_image                = end_image.get();
-    params.control_frames           = control_frame_views.empty() ? nullptr : control_frame_views.data();
-    params.control_frames_size      = static_cast<int>(control_frame_views.size());
-    params.width                    = get_resolved_width();
-    params.height                   = get_resolved_height();
-    params.sample_params            = sample_params;
-    params.high_noise_sample_params = high_noise_sample_params;
-    params.moe_boundary             = moe_boundary;
-    params.strength                 = strength;
-    params.seed                     = seed;
-    params.video_frames             = video_frames;
-    params.fps                      = fps;
-    params.vace_strength            = vace_strength;
-    params.vae_tiling_params        = vae_tiling_params;
-    params.cache                    = cache_params;
+    params.loras                     = lora_vec.empty() ? nullptr : lora_vec.data();
+    params.lora_count                = static_cast<uint32_t>(lora_vec.size());
+    params.prompt                    = prompt.c_str();
+    params.negative_prompt           = negative_prompt.c_str();
+    params.clip_skip                 = clip_skip;
+    params.init_image                = init_image.get();
+    params.end_image                 = end_image.get();
+    params.control_frames            = control_frame_views.empty() ? nullptr : control_frame_views.data();
+    params.control_frames_size       = static_cast<int>(control_frame_views.size());
+    params.width                     = get_resolved_width();
+    params.height                    = get_resolved_height();
+    params.sample_params             = sample_params;
+    params.high_noise_sample_params  = high_noise_sample_params;
+    params.moe_boundary              = moe_boundary;
+    params.strength                  = strength;
+    params.seed                      = seed;
+    params.video_frames              = video_frames;
+    params.fps                       = fps;
+    params.vace_strength             = vace_strength;
+    params.vae_tiling_params         = vae_tiling_params;
+    params.cache                     = cache_params;
+    params.hires.enabled             = hires_enabled;
+    params.hires.upscaler            = resolved_hires_upscaler;
+    params.hires.model_path          = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
+    params.hires.scale               = hires_scale;
+    params.hires.target_width        = hires_width;
+    params.hires.target_height       = hires_height;
+    params.hires.steps               = hires_steps;
+    params.hires.denoising_strength  = hires_denoising_strength;
+    params.hires.upscale_tile_size   = hires_upscale_tile_size;
+    params.hires.custom_sigmas       = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
+    params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
    return params;
 }

@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const {
        << ", target_height: " << hires_height
        << ", steps: " << hires_steps
        << ", denoising_strength: " << hires_denoising_strength
+        << ", custom_sigmas: " << vec_to_string(hires_custom_sigmas)
        << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
        << "  vae_tiling_params: { "
        << vae_tiling_params.enabled << ", "
@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
            {"target_height", gen_params.hires_height},
            {"steps", gen_params.hires_steps},
            {"denoising_strength", gen_params.hires_denoising_strength},
+            {"custom_sigmas", gen_params.hires_custom_sigmas},
            {"upscale_tile_size", gen_params.hires_upscale_tile_size},
        };
    }
@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
        parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
        parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
        parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
+        if (!gen_params.hires_custom_sigmas.empty()) {
+            parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", ";
+        }
    }
    parameter_string += "Version: stable-diffusion.cpp";
    parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);
--- a/otherarch/sdcpp/common/common.h
+++ b/otherarch/sdcpp/common/common.h
@ -207,6 +207,7 @@ struct SDGenerationParams {
    int hires_steps                = 0;
    float hires_denoising_strength = 0.7f;
    int hires_upscale_tile_size    = 128;
+    std::vector<float> hires_custom_sigmas;

    std::map<std::string, float> lora_map;
    std::map<std::string, float> high_noise_lora_map;
--- a/otherarch/sdcpp/ltx_latent_upscaler.hpp
+++ b/otherarch/sdcpp/ltx_latent_upscaler.hpp
@ -0,0 +1,348 @@
+#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
+#define __SD_LTX_LATENT_UPSCALER_HPP__
+
+#include <cinttypes>
+#include <cmath>
+#include <cstdlib>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "common_dit.hpp"
+#include "ggml_extend.hpp"
+#include "ggml_graph_cut.h"
+#include "model.h"
+#include "util.h"
+
+namespace LTXVUpsampler {
+    constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;  // size argument passed to new_graph_custom() in LatentUpsamplerRunner::build_graph
+
+    struct LatentUpsamplerConfig {  // construction parameters for LatentUpsampler; values below are the defaults used when detection finds nothing
+        int64_t in_channels      = 128;    // input channels of initial_conv and output channels of final_conv
+        int64_t mid_channels     = 1024;   // channel width used by the norm, residual blocks and upsampler conv
+        int num_blocks_per_stage = 4;      // count of res_blocks.N and of post_upsample_res_blocks.N
+        int dims                 = 3;      // LatentUpsampler asserts dims == 3
+        bool spatial_upsample    = true;   // LatentUpsampler asserts this is true
+        bool temporal_upsample   = false;  // LatentUpsampler asserts this is false
+        bool rational_resampler  = false;  // LatentUpsampler asserts this is false; detect_config_from_weights never changes it
+    };
+
+    static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,  // true when `name` is a key of tensor_storage_map
+                                  const std::string& name) {
+        return tensor_storage_map.find(name) != tensor_storage_map.end();
+    }
+
+    static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,  // returns ne[0] of the stored tensor `name`, or `fallback` when it is absent
+                                         const std::string& name,
+                                         int64_t fallback) {
+        auto it = tensor_storage_map.find(name);
+        if (it == tensor_storage_map.end()) {
+            return fallback;  // tensor not present: hand back the caller-supplied value
+        }
+        return it->second.ne[0];
+    }
+
+    static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,  // returns (highest N over keys "<module_name>.<N>....") + 1, or 0 when no such key exists
+                                          const std::string& module_name) {
+        int max_block            = -1;
+        const std::string prefix = module_name + ".";
+        for (const auto& pair : tensor_storage_map) {
+            const std::string& name = pair.first;
+            if (name.compare(0, prefix.size(), prefix) != 0) {  // anchored prefix test: does not scan the remainder of the name
+                continue;
+            }
+            size_t begin = prefix.size();
+            size_t end   = name.find_first_not_of("0123456789", begin);  // first character after the run of digits that follows the prefix
+            if (end == std::string::npos || end == begin || name[end] != '.') {  // require "<digits>." so a non-numeric segment is not read by atoi as block 0
+                continue;
+            }
+            int index = atoi(name.substr(begin, end - begin).c_str());
+            max_block = std::max(max_block, index);
+        }
+        return max_block + 1;
+    }
+
+    static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {  // builds a config from which tensors exist and from their ne[0]
+        LatentUpsamplerConfig config;  // starts from the struct's in-class defaults
+        config.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", config.mid_channels);  // default is kept when the tensor is missing
+        config.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", config.in_channels);       // default is kept when the tensor is missing
+        int detected_blocks = count_module_blocks(tensor_storage_map, "res_blocks");
+        if (detected_blocks > 0) {  // 0 means no matching tensor names were found; keep the default count
+            config.num_blocks_per_stage = detected_blocks;
+        }
+        config.spatial_upsample  = has_tensor(tensor_storage_map, "upsampler.0.weight");           // flag follows tensor presence
+        config.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight");  // flag follows tensor presence
+        return config;  // dims and rational_resampler are left at their defaults
+    }
+
+    class VideoGroupNorm : public GGMLBlock {  // group norm with per-channel weight and bias for tensors that carry channels on ne[3]
+    protected:
+        int num_groups;        // group count passed to ggml_group_norm
+        int64_t num_channels;  // length of the weight and bias vectors; must equal x->ne[3] in forward
+        float eps;             // epsilon passed to ggml_group_norm
+        std::string prefix;    // stored by init_params; used to name the tensors given to the weight adapter
+
+        void init_params(ggml_context* ctx,  // allocates F32 "weight" and "bias" vectors of length num_channels
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            SD_UNUSED(tensor_storage_map);  // storage types are not consulted; both params are always F32
+            this->prefix     = prefix;
+            params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
+            params["bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
+        }
+
+    public:
+        VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f)
+            : num_groups(num_groups),
+              num_channels(num_channels),
+              eps(eps) {}
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {  // returns the normalized tensor reshaped back to [W, H, T, C]
+            // LTX video latent layout is [W, H, T, C]. ggml_group_norm treats ne[2]
+            // as channels, so fold only H/T internally and restore the same layout.
+            GGML_ASSERT(x->ne[3] == num_channels);
+            const int64_t W = x->ne[0];
+            const int64_t H = x->ne[1];
+            const int64_t T = x->ne[2];
+            x               = ggml_ext_cont(ctx->ggml_ctx, x);                               // make contiguous before reshaping
+            x               = ggml_reshape_4d(ctx->ggml_ctx, x, W, H * T, num_channels, 1);  // channels move to ne[2]
+            x               = ggml_group_norm(ctx->ggml_ctx, x, num_groups, eps);
+
+            ggml_tensor* weight = params["weight"];
+            ggml_tensor* bias   = params["bias"];
+            if (ctx->weight_adapter) {  // let the adapter patch both parameters when one is installed
+                weight = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, weight, prefix + "weight");
+                bias   = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, bias, prefix + "bias");
+            }
+            weight = ggml_reshape_4d(ctx->ggml_ctx, weight, 1, 1, num_channels, 1);  // one value per channel, placed on ne[2]
+            bias   = ggml_reshape_4d(ctx->ggml_ctx, bias, 1, 1, num_channels, 1);    // one value per channel, placed on ne[2]
+            x      = ggml_mul_inplace(ctx->ggml_ctx, x, weight);
+            x      = ggml_add_inplace(ctx->ggml_ctx, x, bias);
+            return ggml_reshape_4d(ctx->ggml_ctx, x, W, H, T, num_channels);  // unfold H*T back into H and T
+        }
+    };
+
+    class ResBlock : public GGMLBlock {  // residual block: conv1 -> norm1 -> SiLU -> conv2 -> norm2 -> add input -> SiLU
+    public:
+        ResBlock(int64_t channels, int dims = 3) {  // both convolutions keep the channel count unchanged
+            GGML_ASSERT(dims == 3);  // only the Conv3d variant is implemented
+            blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
+            blocks["norm1"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));  // 32 groups
+            blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
+            blocks["norm2"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));  // 32 groups
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {  // returns SiLU(norm2(conv2(SiLU(norm1(conv1(x))))) + x)
+            auto conv1 = std::dynamic_pointer_cast<Conv3d>(blocks["conv1"]);
+            auto norm1 = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm1"]);
+            auto conv2 = std::dynamic_pointer_cast<Conv3d>(blocks["conv2"]);
+            auto norm2 = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm2"]);
+
+            ggml_tensor* residual = x;  // kept for the skip connection
+
+            x = conv1->forward(ctx, x);
+            x = norm1->forward(ctx, x);
+            x = ggml_silu_inplace(ctx->ggml_ctx, x);
+            x = conv2->forward(ctx, x);
+            x = norm2->forward(ctx, x);
+            x = ggml_add(ctx->ggml_ctx, x, residual);  // skip connection
+            return ggml_silu(ctx->ggml_ctx, x);
+        }
+    };
+
+    class PixelShuffleND : public UnaryBlock {  // spatial pixel shuffle built from a permute, a reshape and DiT::unpatchify
+    protected:
+        int upscale_factor;  // forward asserts this is 2
+
+    public:
+        explicit PixelShuffleND(int upscale_factor)
+            : upscale_factor(upscale_factor) {}
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+            GGML_ASSERT(upscale_factor == 2);  // other factors are rejected
+            int64_t h = x->ne[1];  // captured before the permute changes the axis order
+            int64_t w = x->ne[0];
+            // x: [b*f, c*4, h, w] -> [b*f, c, h*2, w*2]
+            x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 2, 0, 1, 3));  // [b*f, h, w, c*4]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0], x->ne[1] * x->ne[2], x->ne[3]);          // [b*f, h*w, c*4]
+            return DiT::unpatchify(ctx->ggml_ctx, x, h, w, upscale_factor, upscale_factor, true);  // NOTE(review): meaning of the trailing `true` is defined by DiT::unpatchify, not visible here
+        }
+    };
+
+    class LatentUpsampler : public GGMLBlock {  // conv + norm, residual blocks, 2x spatial pixel-shuffle upsample, residual blocks, final conv
+    public:
+        LatentUpsamplerConfig config;  // copy of the configuration the blocks were built from
+
+        explicit LatentUpsampler(LatentUpsamplerConfig config)
+            : config(std::move(config)) {
+            GGML_ASSERT(this->config.dims == 3);  // the four asserts reject every variant this class does not build
+            GGML_ASSERT(this->config.spatial_upsample);
+            GGML_ASSERT(!this->config.temporal_upsample);
+            GGML_ASSERT(!this->config.rational_resampler);
+
+            blocks["initial_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.in_channels,  // in_channels -> mid_channels
+                                                                           this->config.mid_channels,
+                                                                           {3, 3, 3},
+                                                                           {1, 1, 1},
+                                                                           {1, 1, 1}));
+            blocks["initial_norm"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, this->config.mid_channels));
+            for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {  // residual blocks applied before the upsample
+                blocks["res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
+            }
+            blocks["upsampler.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.mid_channels,  // mid_channels -> 4 * mid_channels, consumed by the pixel shuffle
+                                                                          4 * this->config.mid_channels,
+                                                                          {3, 3},
+                                                                          {1, 1},
+                                                                          {1, 1}));
+            blocks["upsampler.1"] = std::shared_ptr<GGMLBlock>(new PixelShuffleND(2));
+            for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {  // residual blocks applied after the upsample
+                blocks["post_upsample_res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
+            }
+            blocks["final_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.mid_channels,  // mid_channels -> in_channels
+                                                                         this->config.in_channels,
+                                                                         {3, 3, 3},
+                                                                         {1, 1, 1},
+                                                                         {1, 1, 1}));
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            // x: [b*c, f, h, w]
+            // return: [b*c, f, h*2, w*2]
+            auto initial_conv  = std::dynamic_pointer_cast<Conv3d>(blocks["initial_conv"]);
+            auto initial_norm  = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["initial_norm"]);
+            auto upsample_conv = std::dynamic_pointer_cast<Conv2d>(blocks["upsampler.0"]);
+            auto pixel_shuffle = std::dynamic_pointer_cast<PixelShuffleND>(blocks["upsampler.1"]);
+            auto final_conv    = std::dynamic_pointer_cast<Conv3d>(blocks["final_conv"]);
+
+            x = initial_conv->forward(ctx, x);
+            x = initial_norm->forward(ctx, x);
+            x = ggml_silu(ctx->ggml_ctx, x);
+            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x");  // a graph-cut marker is placed after each stage below as well
+
+            for (int i = 0; i < config.num_blocks_per_stage; ++i) {
+                auto block = std::dynamic_pointer_cast<ResBlock>(blocks["res_blocks." + std::to_string(i)]);
+                x          = block->forward(ctx, x);
+                sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.res_blocks." + std::to_string(i), "x");
+            }
+
+            // rearrange(x, "b c f h w -> (b f) c h w"),
+            x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // [b*f, c, h, w]
+            x = upsample_conv->forward(ctx, x);                                                      // [b*f, c*4, h, w]
+            x = pixel_shuffle->forward(ctx, x);                                                      // [b*f, c, h*2, w*2]
+            x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // [b*c, f, h*2, w*2]
+            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x");
+
+            for (int i = 0; i < config.num_blocks_per_stage; ++i) {
+                auto block = std::dynamic_pointer_cast<ResBlock>(blocks["post_upsample_res_blocks." + std::to_string(i)]);
+                x          = block->forward(ctx, x);
+                sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.post_blocks." + std::to_string(i), "x");
+            }
+
+            x = final_conv->forward(ctx, x);  // back to in_channels
+            sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
+            return x;
+        }
+    };
+
+    struct LatentUpsamplerRunner : public GGMLRunner {  // loads LatentUpsampler weights from a file and runs the model on a latent tensor
+        std::unique_ptr<LatentUpsampler> model;  // null until load_from_file has constructed it
+
+        LatentUpsamplerRunner(ggml_backend_t backend,
+                              ggml_backend_t params_backend)
+            : GGMLRunner(backend, params_backend) {}
+
+        std::string get_desc() override {
+            return "ltx_latent_upsampler";
+        }
+
+        bool load_from_file(const std::string& file_path, int n_threads) {  // returns false, after logging, on any failure
+            LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path)) {
+                LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
+                return false;
+            }
+
+            const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||  // both tensors must exist before any config detection
+                !has_tensor(tensor_storage_map, "upsampler.0.weight")) {
+                LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors");
+                return false;
+            }
+
+            LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
+            if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample ||  // same conditions the LatentUpsampler constructor asserts; checked here to fail without aborting
+                config.rational_resampler) {
+                LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d",
+                          config.dims,
+                          config.spatial_upsample,
+                          config.temporal_upsample,
+                          config.rational_resampler);
+                return false;
+            }
+
+            model = std::make_unique<LatentUpsampler>(config);
+            model->init(params_ctx, tensor_storage_map, "");  // empty prefix: tensor names are used as stored in the file
+            if (!alloc_params_buffer()) {
+                LOG_ERROR("LTX latent upsampler params buffer allocation failed");
+                return false;  // NOTE(review): `model` stays non-null on this path and the next failure path; confirm callers do not call compute() after a failed load
+            }
+
+            std::map<std::string, ggml_tensor*> tensors;
+            model->get_param_tensors(tensors);
+            if (!model_loader.load_tensors(tensors, {}, n_threads)) {
+                LOG_ERROR("load LTX latent upsampler tensors failed");
+                return false;
+            }
+
+            LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d",
+                     config.in_channels,
+                     config.mid_channels,
+                     config.num_blocks_per_stage);
+            return true;
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {  // returns nullptr when no model is loaded
+            if (!model) {
+                return nullptr;
+            }
+            ggml_cgraph* gf  = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE);
+            ggml_tensor* x   = make_input(x_tensor);
+            auto runner_ctx  = get_context();
+            ggml_tensor* out = model->forward(&runner_ctx, x);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(const int n_threads,  // returns an empty tensor, after logging, when the input is rejected
+                                  const sd::Tensor<float>& x) {
+            if (!model) {
+                LOG_ERROR("LTX latent upsampler is not loaded");
+                return {};
+            }
+            if (x.dim() != 4 && x.dim() != 5) {
+                LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld",
+                          (long long)x.dim());
+                return {};
+            }
+            if (x.dim() == 5 && x.shape()[4] != 1) {  // a fifth dimension is accepted only with extent 1
+                LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld",
+                          (long long)x.shape()[4]);
+                return {};
+            }
+            if (x.shape()[3] != model->config.in_channels) {  // channel count is read from shape index 3
+                LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
+                          model->config.in_channels,
+                          (long long)x.shape()[3]);
+                return {};
+            }
+            size_t expected_dim = static_cast<size_t>(x.dim());  // output is restored to the input's dimensionality
+            auto get_graph      = [&]() -> ggml_cgraph* { return build_graph(x); };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
+        }
+    };
+
+}  // namespace LTXVUpsampler
+
+#endif  // __SD_LTX_LATENT_UPSCALER_HPP__
--- a/otherarch/sdcpp/ltx_vae.hpp
+++ b/otherarch/sdcpp/ltx_vae.hpp
@ -1123,6 +1123,18 @@ namespace LTXVAE {
            mean      = ggml_cont(ctx->ggml_ctx, mean);
            return processor->normalize(ctx, mean);
        }
+
+        ggml_tensor* normalize_latents(GGMLRunnerContext* ctx,  // Applies normalize() of the "per_channel_statistics" block to x and returns the resulting graph node.
+                                       ggml_tensor* x) {
+            auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);  // NOTE(review): the cast result is dereferenced without a null check
+            return processor->normalize(ctx, x);
+        }
+
+        ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx,  // Applies un_normalize() of the "per_channel_statistics" block to x and returns the resulting graph node.
+                                          ggml_tensor* x) {
+            auto processor = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);  // NOTE(review): the cast result is dereferenced without a null check
+            return processor->un_normalize(ctx, x);
+        }
    };

 }  // namespace LTXVAE
@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE {
        return gf;
    }

+    ggml_cgraph* build_latent_statistics_graph(const sd::Tensor<float>& z_tensor, bool normalize) {  // Graph that applies vae.normalize_latents (normalize=true) or vae.un_normalize_latents (normalize=false) to z_tensor.
+        ggml_cgraph* gf = new_graph_custom(1024);
+        ggml_tensor* z  = make_input(z_tensor);
+
+        auto runner_ctx  = get_context();
+        ggml_tensor* out = normalize ? vae.normalize_latents(&runner_ctx, z)
+                                     : vae.un_normalize_latents(&runner_ctx, z);
+        ggml_build_forward_expand(gf, out);  // register `out` as the result the graph has to produce
+        return gf;
+    }
+
    sd::Tensor<float> _compute(const int n_threads,
                               const sd::Tensor<float>& z,
                               bool decode_graph) override {
@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE {
        return result;
    }

+    sd::Tensor<float> apply_latent_statistics(const int n_threads,  // Runs the latent-statistics graph on z; `normalize` selects the direction (true = normalize, false = un-normalize).
+                                              const sd::Tensor<float>& z,
+                                              bool normalize) {
+        auto get_graph = [&]() -> ggml_cgraph* {
+            return build_latent_statistics_graph(z, normalize);
+        };
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false),
+                                               static_cast<size_t>(z.dim()));  // z's rank is handed to the helper; presumably it pads the result back to that rank - confirm
+    }
+
+    sd::Tensor<float> normalize_latents(const int n_threads,  // Convenience wrapper: apply_latent_statistics with normalize=true.
+                                        const sd::Tensor<float>& z) {
+        return apply_latent_statistics(n_threads, z, true);
+    }
+
+    sd::Tensor<float> un_normalize_latents(const int n_threads,  // Convenience wrapper: apply_latent_statistics with normalize=false.
+                                           const sd::Tensor<float>& z) {
+        return apply_latent_statistics(n_threads, z, false);
+    }
+
    int get_encoder_output_channels(int input_channels) override {
        SD_UNUSED(input_channels);
        return 256;
--- a/otherarch/sdcpp/ltxv.hpp
+++ b/otherarch/sdcpp/ltxv.hpp
@ -1487,6 +1487,9 @@ namespace LTXV {
                    ->forward(ctx, ggml_ext_scale(ctx->ggml_ctx, av_ca_audio_timestep, av_ca_factor))
                    .first;

+            sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx");
+            sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax");
+
            for (int i = 0; i < cfg.num_layers; i++) {
                auto block = std::dynamic_pointer_cast<BasicAVTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
                auto out   = block->forward(ctx,
@ -1509,6 +1512,8 @@ namespace LTXV {
                                            a_prompt_timestep_mod);
                vx         = out.first;
                ax         = out.second;
+                sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.transformer_blocks." + std::to_string(i), "vx");
+                sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax");
            }

            auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size);
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@ -17,6 +17,7 @@
 #include "guidance.h"
 #include "lora.hpp"
 #include "ltx_audio_vae.h"
+#include "ltx_latent_upscaler.hpp"
 #include "ltx_vae.hpp"
 #include "pmid.hpp"
 #include "sample-cache.h"
@ -883,7 +884,8 @@ public:
            auto create_tae = [&]() -> std::shared_ptr<VAE> {
                if (sd_version_is_wan(version) ||
                    sd_version_is_qwen_image(version) ||
-                    sd_version_is_anima(version)) {
+                    sd_version_is_anima(version) ||
+                    sd_version_is_ltxav(version)) {
                    return std::make_shared<TinyVideoAutoEncoder>(backend_for(SDBackendModule::VAE),
                                                                  params_backend_for(SDBackendModule::VAE),
                                                                  tensor_storage_map,
@ -1430,7 +1432,7 @@ public:
        }
        auto lora = std::make_shared<LoraModel>(lora_id,
                                                backend_for(module),
-                                                params_backend_for(module),
+                                                backend_for(module),
                                                lora_path,
                                                is_high_noise ? "model.high_noise_" : "",
                                                version);
@ -2421,6 +2423,24 @@ public:
        return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
    }

+    sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {  // Normalizes x through the LTX video VAE; returns {} (after LOG_ERROR) when first_stage_model is not an LTXVideoVAE.
+        auto ltx_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model);
+        if (!ltx_vae) {  // also taken when first_stage_model is null
+            LOG_ERROR("LTX latent normalization requires LTX video VAE");
+            return {};
+        }
+        return ltx_vae->normalize_latents(n_threads, x);
+    }
+
+    sd::Tensor<float> un_normalize_ltx_video_latents(const sd::Tensor<float>& x) {  // Un-normalizes x through the LTX video VAE; returns {} (after LOG_ERROR) when first_stage_model is not an LTXVideoVAE.
+        auto ltx_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model);
+        if (!ltx_vae) {  // also taken when first_stage_model is null
+            LOG_ERROR("LTX latent un-normalization requires LTX video VAE");
+            return {};
+        }
+        return ltx_vae->un_normalize_latents(n_threads, x);
+    }
+
    sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) {
        if (audio_vae_model == nullptr || audio_latent.empty()) {
            return {};
@ -2704,16 +2724,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
 }

 void sd_hires_params_init(sd_hires_params_t* hires_params) {
-    *hires_params                    = {};
-    hires_params->enabled            = false;
-    hires_params->upscaler           = SD_HIRES_UPSCALER_LATENT;
-    hires_params->model_path         = nullptr;
-    hires_params->scale              = 2.0f;
-    hires_params->target_width       = 0;
-    hires_params->target_height      = 0;
-    hires_params->steps              = 0;
-    hires_params->denoising_strength = 0.7f;
-    hires_params->upscale_tile_size  = 128;
+    *hires_params                     = {};  // value-initialize every field first; the assignments below spell out the defaults
+    hires_params->enabled             = false;
+    hires_params->upscaler            = SD_HIRES_UPSCALER_LATENT;
+    hires_params->model_path          = nullptr;
+    hires_params->scale               = 2.0f;
+    hires_params->target_width        = 0;
+    hires_params->target_height       = 0;
+    hires_params->steps               = 0;
+    hires_params->denoising_strength  = 0.7f;
+    hires_params->upscale_tile_size   = 128;
+    hires_params->custom_sigmas       = nullptr;  // no caller-supplied sigma schedule by default
+    hires_params->custom_sigmas_count = 0;
 }

 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
@ -2986,6 +3008,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
    sd_vid_gen_params->moe_boundary                          = 0.875f;
    sd_vid_gen_params->vace_strength                         = 1.f;
    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_vid_gen_params->hires.enabled                         = false;
+    sd_vid_gen_params->hires.upscaler                        = SD_HIRES_UPSCALER_LATENT;
+    sd_vid_gen_params->hires.scale                           = 2.f;
+    sd_vid_gen_params->hires.target_width                    = 0;
+    sd_vid_gen_params->hires.target_height                   = 0;
+    sd_vid_gen_params->hires.steps                           = 0;
+    sd_vid_gen_params->hires.denoising_strength              = 0.7f;
+    sd_vid_gen_params->hires.upscale_tile_size               = 128;
+    sd_vid_gen_params->hires.custom_sigmas                   = nullptr;
+    sd_vid_gen_params->hires.custom_sigmas_count             = 0;
    sd_cache_params_init(&sd_vid_gen_params->cache);
 }

@ -3235,6 +3267,7 @@ struct GenerationRequest {
        vace_strength               = sd_vid_gen_params->vace_strength;
        guidance                    = sd_vid_gen_params->sample_params.guidance;
        high_noise_guidance         = sd_vid_gen_params->high_noise_sample_params.guidance;
+        hires                       = sd_vid_gen_params->hires;
        resolve(sd_ctx);
        if (frames != requested_frames) {
            LOG_WARN("align video frames from %d to %d for %s",
@ -3293,6 +3326,20 @@ struct GenerationRequest {
            hires.enabled = false;
            return;
        }
+        if (hires.custom_sigmas_count < 0) {
+            LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas");
+            hires.custom_sigmas       = nullptr;
+            hires.custom_sigmas_count = 0;
+        }
+        if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) {
+            LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas");
+            hires.custom_sigmas_count = 0;
+        }
+        if (hires.custom_sigmas_count == 1) {
+            LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas");
+            hires.custom_sigmas       = nullptr;
+            hires.custom_sigmas_count = 0;
+        }
        hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
        hires.steps              = std::max(0, hires.steps);

@ -3657,6 +3704,85 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso
    return sd::ops::concat(video_mask_full, audio_mask, 3);
 }

+static sd::Tensor<float> make_ltxav_video_denoise_mask(const sd::Tensor<float>& video_latent, float value = 1.f) {  // Mask filled with `value`: same extent as video_latent on axes 0-2, extent 1 on axes 3 and 4; {} for an empty latent.
+    if (video_latent.empty()) {
+        return {};
+    }
+    return sd::full<float>({video_latent.shape()[0],
+                            video_latent.shape()[1],
+                            video_latent.shape()[2],
+                            1,
+                            1},
+                           value);
+}
+
+static sd::Tensor<float> encode_ltxav_condition_image(sd_ctx_t* sd_ctx,  // Encodes one conditioning image with encode_first_stage; returns {} for null context, empty image, or a failed encode.
+                                                      const sd::Tensor<float>& image,
+                                                      const char* name) {  // name: label that only appears in the error log
+    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || image.empty()) {
+        return {};
+    }
+    auto condition_image  = image.reshape({image.shape()[0],  // insert an axis of extent 1 at position 2 (presumably a single-frame axis - confirm)
+                                           image.shape()[1],
+                                           1,
+                                           image.shape()[2],
+                                           image.shape()[3]});
+    auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
+    if (condition_latent.empty()) {
+        LOG_ERROR("failed to encode LTXAV %s image", name);
+    }
+    return condition_latent;  // still empty on failure, so callers can test empty()
+}
+
+static bool apply_ltxav_condition_by_latent_index(sd::Tensor<float>* video_latent,  // Copies condition_latent into *video_latent along axis 2 starting at latent_idx and marks the same range in *video_mask; false on any rejected input.
+                                                  sd::Tensor<float>* video_mask,
+                                                  const sd::Tensor<float>& condition_latent,
+                                                  int64_t latent_idx,
+                                                  const char* name,  // label that only appears in the error logs
+                                                  float conditioned_mask) {  // value written into the mask for the conditioned range
+    if (video_latent == nullptr || video_mask == nullptr || video_latent->empty() || video_mask->empty()) {  // missing targets fail without a log message
+        return false;
+    }
+    if (condition_latent.empty() ||
+        condition_latent.shape()[0] != video_latent->shape()[0] ||
+        condition_latent.shape()[1] != video_latent->shape()[1] ||
+        condition_latent.shape()[3] != video_latent->shape()[3]) {  // axes 0, 1 and 3 must match; axis 2 is the only one allowed to differ
+        LOG_ERROR("invalid LTXAV %s condition latent shape", name);
+        return false;
+    }
+    int64_t latent_frames    = video_latent->shape()[2];
+    int64_t condition_frames = condition_latent.shape()[2];
+    if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {  // the range [latent_idx, latent_idx + condition_frames) must lie inside the latent
+        LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
+                  name,
+                  latent_idx,
+                  condition_frames,
+                  latent_frames);
+        return false;
+    }
+
+    sd::ops::slice_assign(video_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
+    sd::ops::fill_slice(video_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
+    return true;
+}
+
+static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx,  // Encodes `image` and applies the resulting latent at latent_idx; false when encoding or applying fails.
+                                                        const sd::Tensor<float>& image,
+                                                        sd::Tensor<float>* video_latent,
+                                                        sd::Tensor<float>* video_mask,
+                                                        int64_t latent_idx,
+                                                        const char* name,
+                                                        float strength) {  // clamped to [0, 1] below
+    auto condition_latent = encode_ltxav_condition_image(sd_ctx, image, name);
+    return !condition_latent.empty() &&
+           apply_ltxav_condition_by_latent_index(video_latent,
+                                                 video_mask,
+                                                 condition_latent,
+                                                 latent_idx,
+                                                 name,
+                                                 1.0f - std::clamp(strength, 0.f, 1.f));  // mask value is the complement of the strength: strength 1 writes 0
+}
+
 static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent,
                                                   int audio_length,
                                                   int video_channels) {
@ -4218,6 +4344,53 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
    return {};
 }

+static std::vector<float> make_hires_sigma_schedule(sd_ctx_t* sd_ctx,  // Returns the sigma sequence for the hires pass: the caller's custom sigmas when given, otherwise the tail of a scheduler-generated sequence.
+                                                    const sd_hires_params_t& hires,
+                                                    const sd_sample_params_t& sample_params,
+                                                    sample_method_t sample_method,
+                                                    int default_steps,  // used when hires.steps is not positive
+                                                    int sample_seq_len,
+                                                    int* scheduler_steps_out) {  // optional out-parameter; may be nullptr
+    if (scheduler_steps_out != nullptr) {
+        *scheduler_steps_out = 0;
+    }
+
+    if (hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr) {  // custom sigmas are copied verbatim; steps and denoising_strength are not consulted
+        std::vector<float> custom_sigmas(hires.custom_sigmas,
+                                         hires.custom_sigmas + hires.custom_sigmas_count);
+        if (scheduler_steps_out != nullptr) {
+            *scheduler_steps_out = static_cast<int>(custom_sigmas.size()) - 1;  // reported as one less than the number of sigmas
+        }
+        return custom_sigmas;
+    }
+
+    int effective_steps = hires.steps > 0 ? hires.steps : default_steps;
+    effective_steps     = std::max(1, effective_steps);
+
+    // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
+    // unlike img2img which trims from a fixed step count.
+    int scheduler_steps = static_cast<int>(effective_steps / hires.denoising_strength);  // NOTE(review): assumes denoising_strength > 0 - confirm every caller clamps it first
+    scheduler_steps     = std::max(1, scheduler_steps);
+
+    scheduler_t scheduler     = resolve_scheduler(sd_ctx,
+                                                  sample_params.scheduler,
+                                                  sample_method);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(scheduler_steps,
+                                                                 sample_seq_len,
+                                                                 scheduler,
+                                                                 sd_ctx->sd->version,
+                                                                 sample_params.extra_sample_args);
+    size_t t_enc              = static_cast<size_t>(scheduler_steps * hires.denoising_strength);
+    if (t_enc >= static_cast<size_t>(scheduler_steps)) {  // cap at scheduler_steps - 1 so the start offset below never goes negative
+        t_enc = static_cast<size_t>(scheduler_steps) - 1;
+    }
+    if (scheduler_steps_out != nullptr) {
+        *scheduler_steps_out = scheduler_steps;
+    }
+    return std::vector<float>(sigmas.begin() + scheduler_steps - static_cast<int>(t_enc) - 1,  // NOTE(review): assumes get_sigmas returned more than (scheduler_steps - t_enc - 1) entries; a shorter vector puts this iterator out of range - confirm
+                              sigmas.end());
+}
+
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
        return nullptr;
@ -4340,29 +4513,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
            }
        }

-        int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps;
-
-        // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
-        // unlike img2img which trims from a fixed step count
-        hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength);
-
-        std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas(
-            hires_steps,
-            sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
-            sd_img_gen_params->sample_params.scheduler,
-            sd_ctx->sd->version,
-            sd_img_gen_params->sample_params.extra_sample_args);
-
-        size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
-        if (t_enc >= static_cast<size_t>(hires_steps)) {
-            t_enc = static_cast<size_t>(hires_steps) - 1;
-        }
-        std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
-                                             hires_sigmas.end());
-        LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
-                 hires_steps,
+        int hires_scheduler_steps = 0;
+        std::vector<float> hires_sigma_sched =
+            make_hires_sigma_schedule(sd_ctx,
+                                      request.hires,
+                                      sd_img_gen_params->sample_params,
+                                      plan.sample_method,
+                                      plan.sample_steps,
+                                      sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
+                                      &hires_scheduler_steps);
+        LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s",
+                 hires_scheduler_steps,
                 request.hires.denoising_strength,
-                 hires_sigma_sched.size());
+                 hires_sigma_sched.size(),
+                 request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");

        std::vector<sd::Tensor<float>> hires_final_latents;
        int64_t hires_denoise_start = ggml_time_ms();
@ -4510,44 +4674,7 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd

            float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
            float conditioned_mask      = 1.0f - conditioning_strength;
-            latents.denoise_mask        = sd::full<float>({latents.init_latent.shape()[0],
-                                                           latents.init_latent.shape()[1],
-                                                           latents.init_latent.shape()[2],
-                                                           1,
-                                                           1},
-                                                   1.f);
-
-            auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
-                auto condition_image  = image.reshape({image.shape()[0],
-                                                       image.shape()[1],
-                                                       1,
-                                                       image.shape()[2],
-                                                       image.shape()[3]});
-                auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
-                if (condition_latent.empty()) {
-                    LOG_ERROR("failed to encode LTXAV %s image", name);
-                }
-                return condition_latent;
-            };
-
-            auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
-                                                             int64_t latent_idx,
-                                                             const char* name) -> bool {
-                int64_t latent_frames    = latents.init_latent.shape()[2];
-                int64_t condition_frames = condition_latent.shape()[2];
-                if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
-                    LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
-                              name,
-                              latent_idx,
-                              condition_frames,
-                              latent_frames);
-                    return false;
-                }
-
-                sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
-                sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
-                return true;
-            };
+            latents.denoise_mask        = make_ltxav_video_denoise_mask(latents.init_latent, 1.f);

            auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
                                                               int frame_idx,
@ -4585,20 +4712,30 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
            };

            if (!start_image.empty()) {
-                auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
-                if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
+                if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
+                                                                 start_image,
+                                                                 &latents.init_latent,
+                                                                 &latents.denoise_mask,
+                                                                 0,
+                                                                 "init",
+                                                                 conditioning_strength)) {
                    return std::nullopt;
                }
            }

            if (!end_image.empty()) {
-                auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
+                auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
                if (end_image_latent.empty()) {
                    return std::nullopt;
                }

                int frame_idx = request->frames - 1;
-                bool ok       = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
+                bool ok       = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent,
+                                                                                       &latents.denoise_mask,
+                                                                                       end_image_latent,
+                                                                                       0,
+                                                                                       "end",
+                                                                                       conditioned_mask)
                                               : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
                if (!ok) {
                    return std::nullopt;
@ -4879,6 +5016,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
    return result_images;
 }

+// Upscales the video part of an LTX latent with the latent upsampler model at
+// `model_path` and returns it in the layout it came in.
+//
+//   sd_ctx        generation context; sd_ctx->sd must be set.
+//   model_path    upsampler weights file; null or empty is rejected.
+//   packed_latent video latent, optionally with audio packed on axis 3 beyond
+//                 the model's latent channel count.
+//   audio_length  forwarded to unpack_ltxav_audio_latent for the packed case.
+//
+// Pipeline: split off audio (if packed) -> un-normalize video latent ->
+// run the upsampler -> normalize -> re-pack audio.
+// Returns an empty tensor on any failure.
+static sd::Tensor<float> upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx,
+                                                          const char* model_path,
+                                                          const sd::Tensor<float>& packed_latent,
+                                                          int audio_length) {
+    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) {
+        return {};
+    }
+    if (strlen(SAFE_STR(model_path)) == 0) {
+        LOG_ERROR("LTX latent spatial upscale requires a model path");
+        return {};
+    }
+    if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) {
+        return {};
+    }
+
+    int latent_channels            = sd_ctx->sd->get_latent_channel();
+    sd::Tensor<float> video_latent = packed_latent;
+    sd::Tensor<float> audio_latent;
+    if (packed_latent.shape()[3] > latent_channels) {
+        video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels);
+        audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels);
+        // A packed latent whose audio cannot be recovered must fail here:
+        // continuing would return a video-only latent and silently drop the
+        // audio track. This matches the handling in
+        // apply_ltxv_refine_image_conditioning.
+        if (audio_latent.empty()) {
+            LOG_ERROR("failed to unpack LTXAV audio latent before spatial upscale");
+            return {};
+        }
+    }
+
+    LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2",
+             (int)video_latent.shape()[0],
+             (int)video_latent.shape()[1],
+             (int)video_latent.shape()[2],
+             (int)video_latent.shape()[3]);
+
+    // The upsampler is fed un-normalized latents; normalization is re-applied
+    // to its output further down.
+    sd::Tensor<float> unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent);
+    if (unnormalized.empty()) {
+        LOG_ERROR("LTX latent un-normalization failed before spatial upscale");
+        return {};
+    }
+
+    std::unique_ptr<LTXVUpsampler::LatentUpsamplerRunner> upsampler =
+        std::make_unique<LTXVUpsampler::LatentUpsamplerRunner>(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER),
+                                                               sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER));
+    // Forward the context's max_vram setting to the runner.
+    const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
+    upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes);
+    if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) {
+        LOG_ERROR("load LTX latent upsampler failed");
+        return {};
+    }
+
+    sd::Tensor<float> upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized);
+    // Destroy the runner as soon as its output is in hand, before the
+    // normalization pass below runs.
+    upsampler.reset();
+    if (upscaled.empty()) {
+        LOG_ERROR("LTX latent spatial upscale failed");
+        return {};
+    }
+
+    upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled);
+    if (upscaled.empty()) {
+        LOG_ERROR("LTX latent normalization failed after spatial upscale");
+        return {};
+    }
+
+    // Restore the packed layout when the input carried audio.
+    if (!audio_latent.empty()) {
+        upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent);
+    }
+    return upscaled;
+}
+
+static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx,  // Re-applies init/end image conditioning to *latent and rebuilds *denoise_mask (plus *video_positions for the end-frame keyframe case); false only on a real failure.
+                                                 const sd_vid_gen_params_t* sd_vid_gen_params,
+                                                 const GenerationRequest& request,
+                                                 const ImageGenerationLatents& latents,
+                                                 sd::Tensor<float>* latent,  // in/out: possibly audio-packed latent
+                                                 sd::Tensor<float>* denoise_mask,  // out: overwritten when conditioning is applied
+                                                 sd::Tensor<float>* video_positions) {  // out: overwritten only in the end-image branch with frame_idx != 0
+    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || sd_vid_gen_params == nullptr ||
+        latent == nullptr || latent->empty() || denoise_mask == nullptr || video_positions == nullptr) {  // NOTE(review): missing arguments are reported as success (nothing applied) - confirm this is intended
+        return true;
+    }
+    if (sd_vid_gen_params->init_image.data == nullptr &&
+        sd_vid_gen_params->end_image.data == nullptr) {  // no conditioning images: nothing to do
+        return true;
+    }
+    if (sd_ctx->sd->vae_decode_only) {
+        LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
+        return false;
+    }
+
+    constexpr float conditioning_strength = 1.f;  // full strength: conditioned frames get mask value 1 - 1 = 0
+    int latent_channels                   = sd_ctx->sd->get_latent_channel();
+    sd::Tensor<float> video_latent        = *latent;
+    sd::Tensor<float> audio_latent;
+    if (latent->shape()[3] > latent_channels) {  // more channels on axis 3 than the video latent has: audio is packed in
+        video_latent = sd::ops::slice(*latent, 3, 0, latent_channels);
+        audio_latent = unpack_ltxav_audio_latent(*latent, latents.audio_length, latent_channels);
+        if (audio_latent.empty()) {
+            LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning");
+            return false;
+        }
+    }
+
+    int image_width              = static_cast<int>(video_latent.shape()[0]) * request.vae_scale_factor;  // pixel size derived from the current latent size
+    int image_height             = static_cast<int>(video_latent.shape()[1]) * request.vae_scale_factor;
+    sd::Tensor<float> video_mask = make_ltxav_video_denoise_mask(video_latent, 1.f);  // start from an all-ones mask
+
+    if (sd_vid_gen_params->init_image.data != nullptr) {
+        sd::Tensor<float> start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, image_width, image_height);
+        if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
+                                                         start_image,
+                                                         &video_latent,
+                                                         &video_mask,
+                                                         0,
+                                                         "init",
+                                                         conditioning_strength)) {
+            return false;
+        }
+    }
+
+    if (sd_vid_gen_params->end_image.data != nullptr) {
+        sd::Tensor<float> end_image        = sd_image_to_tensor(sd_vid_gen_params->end_image, image_width, image_height);
+        sd::Tensor<float> end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
+        if (end_image_latent.empty()) {
+            return false;
+        }
+
+        int frame_idx = request.frames - 1;
+        if (frame_idx == 0) {  // single-frame request: the end image lands on latent index 0
+            if (!apply_ltxav_condition_by_latent_index(&video_latent,
+                                                       &video_mask,
+                                                       end_image_latent,
+                                                       0,
+                                                       "end",
+                                                       1.f - conditioning_strength)) {
+                return false;
+            }
+        } else {
+            if (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0) {  // both counts must be positive; presumably filled in by the low-resolution pass - confirm
+                LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata");
+                return false;
+            }
+            int64_t target_latent_frames = latents.video_target_frame_count;
+            if (!apply_ltxav_condition_by_latent_index(&video_latent,
+                                                       &video_mask,
+                                                       end_image_latent,
+                                                       target_latent_frames,  // the end latent is written starting right after the target frames
+                                                       "end",
+                                                       1.f - conditioning_strength)) {
+                return false;
+            }
+            *video_positions = build_ltxv_video_positions(video_latent.shape()[0],
+                                                          video_latent.shape()[1],
+                                                          target_latent_frames,
+                                                          end_image_latent.shape()[2],
+                                                          frame_idx,
+                                                          1,
+                                                          request.fps,
+                                                          request.vae_scale_factor,
+                                                          8,
+                                                          true);
+        }
+    }
+
+    if (!audio_latent.empty()) {  // restore the packed audio+video layout for both outputs
+        *latent       = pack_ltxav_audio_and_video_latents(video_latent, audio_latent);
+        *denoise_mask = pack_ltxav_audio_and_video_denoise_mask(video_mask, video_latent, audio_latent);
+    } else {
+        *latent       = std::move(video_latent);
+        *denoise_mask = std::move(video_mask);
+    }
+    LOG_INFO("LTXV refine image conditioning applied at %dx%d", image_width, image_height);
+    return true;
+}
+
 SD_API bool generate_video(sd_ctx_t* sd_ctx,
                           const sd_vid_gen_params_t* sd_vid_gen_params,
                           sd_image_t** frames_out,
@ -4899,6 +5205,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
    int64_t t0                    = ggml_time_ms();
    sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
    GenerationRequest request(sd_ctx, sd_vid_gen_params);
+    bool latent_upscale_enabled     = request.hires.enabled;
+    GenerationRequest hires_request = request;
+    if (latent_upscale_enabled) {
+        if (!sd_version_is_ltxav(sd_ctx->sd->version)) {
+            LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models");
+            return false;
+        }
+        if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) {
+            LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL");
+            return false;
+        }
+        if (strlen(SAFE_STR(request.hires.model_path)) == 0) {
+            LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided");
+            return false;
+        }
+    }
+
    sd_ctx->sd->rng->manual_seed(request.seed);
    sd_ctx->sd->sampler_rng->manual_seed(request.seed);
    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
@ -4910,14 +5233,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
        return false;
    }
    ImageGenerationLatents latents = std::move(*latent_inputs_opt);
-    ImageGenerationEmbeds embeds   = prepare_video_generation_embeds(sd_ctx,
-                                                                     sd_vid_gen_params,
-                                                                     request,
-                                                                     latents);
-    LOG_INFO("generate_video %dx%dx%d",
-             request.width,
-             request.height,
-             request.frames);
+
+    ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
+                                                                   sd_vid_gen_params,
+                                                                   request,
+                                                                   latents);
+    if (latent_upscale_enabled) {
+        LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale",
+                 request.width,
+                 request.height,
+                 request.frames);
+    } else {
+        LOG_INFO("generate_video %dx%dx%d",
+                 request.width,
+                 request.height,
+                 request.frames);
+    }

    int64_t latent_start = ggml_time_ms();
    int W                = request.width / request.vae_scale_factor;
@ -5009,15 +5340,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                        latents.video_positions);

    int64_t sampling_end = ggml_time_ms();
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->diffusion_model->free_params_buffer();
-    }
    if (final_latent.empty()) {
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->diffusion_model->free_params_buffer();
+        }
        LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
        return false;
    }
    LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);

+    if (latent_upscale_enabled) {
+        int64_t upscale_start             = ggml_time_ms();
+        sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
+                                                                             request.hires.model_path,
+                                                                             final_latent,
+                                                                             latents.audio_length);
+        int64_t upscale_end               = ggml_time_ms();
+        if (upscaled_latent.empty()) {
+            if (sd_ctx->sd->free_params_immediately) {
+                sd_ctx->sd->diffusion_model->free_params_buffer();
+            }
+            return false;
+        }
+        LOG_INFO("LTX latent spatial upscale completed, taking %.2fs",
+                 (upscale_end - upscale_start) * 1.0f / 1000);
+
+        x_t                  = std::move(upscaled_latent);
+        hires_request.width  = static_cast<int>(x_t.shape()[0]) * hires_request.vae_scale_factor;
+        hires_request.height = static_cast<int>(x_t.shape()[1]) * hires_request.vae_scale_factor;
+        if ((request.hires.target_width > 0 || request.hires.target_height > 0) &&
+            (request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) {
+            LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d",
+                     hires_request.width,
+                     hires_request.height,
+                     request.hires.target_width,
+                     request.hires.target_height);
+        }
+        sd::Tensor<float> hires_denoise_mask;
+        sd::Tensor<float> hires_video_positions;
+        if (!apply_ltxv_refine_image_conditioning(sd_ctx,
+                                                  sd_vid_gen_params,
+                                                  hires_request,
+                                                  latents,
+                                                  &x_t,
+                                                  &hires_denoise_mask,
+                                                  &hires_video_positions)) {
+            if (sd_ctx->sd->free_params_immediately) {
+                sd_ctx->sd->diffusion_model->free_params_buffer();
+            }
+            return false;
+        }
+        noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
+
+        W                                   = hires_request.width / hires_request.vae_scale_factor;
+        H                                   = hires_request.height / hires_request.vae_scale_factor;
+        T                                   = static_cast<int>(x_t.shape()[2]);
+        sample_method_t hires_sample_method = plan.sample_method;
+        int hires_scheduler_steps           = 0;
+        std::vector<float> hires_sigma_sched =
+            make_hires_sigma_schedule(sd_ctx,
+                                      request.hires,
+                                      sd_vid_gen_params->sample_params,
+                                      hires_sample_method,
+                                      plan.sample_steps,
+                                      sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T,
+                                      &hires_scheduler_steps);
+        float hires_eta = resolve_eta(sd_ctx,
+                                      sd_vid_gen_params->sample_params.eta,
+                                      hires_sample_method);
+
+        LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T);
+        LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s",
+                 hires_scheduler_steps,
+                 request.hires.denoising_strength,
+                 sampling_methods_str[hires_sample_method],
+                 hires_sigma_sched.size(),
+                 request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
+
+        sampling_start = ggml_time_ms();
+        final_latent   = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
+                                            true,
+                                            x_t,
+                                            std::move(noise),
+                                            embeds.cond,
+                                          hires_request.use_uncond ? embeds.uncond : SDCondition(),
+                                            embeds.img_cond,
+                                            embeds.id_cond,
+                                            sd::Tensor<float>(),
+                                            0.f,
+                                            sd_vid_gen_params->sample_params.guidance,
+                                            hires_eta,
+                                            sd_vid_gen_params->sample_params.shifted_timestep,
+                                            hires_sample_method,
+                                            sd_ctx->sd->is_flow_denoiser(),
+                                            plan.extra_sample_args,
+                                            hires_sigma_sched,
+                                            -1,
+                                            std::vector<sd::Tensor<float>>{},
+                                            false,
+                                            hires_denoise_mask,
+                                            sd::Tensor<float>(),
+                                            hires_request.vace_strength,
+                                            latents.audio_length,
+                                            static_cast<float>(hires_request.fps),
+                                            hires_request.cache_params,
+                                            hires_video_positions);
+        sampling_end   = ggml_time_ms();
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->diffusion_model->free_params_buffer();
+        }
+        if (final_latent.empty()) {
+            LOG_ERROR("sampling(latent upscale) failed after %.2fs",
+                      (sampling_end - sampling_start) * 1.0f / 1000);
+            return false;
+        }
+        LOG_INFO("sampling(latent upscale) completed, taking %.2fs",
+                 (sampling_end - sampling_start) * 1.0f / 1000);
+    } else if (sd_ctx->sd->free_params_immediately) {
+        sd_ctx->sd->diffusion_model->free_params_buffer();
+    }
+
    sd_audio_t* generated_audio = nullptr;
    if (sd_version_is_ltxav(sd_ctx->sd->version) &&
        latents.audio_length > 0 &&
@ -5048,7 +5490,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
    int64_t latent_end = ggml_time_ms();
    LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);

-    auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
+    auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
    if (result == nullptr) {
        free_sd_audio(generated_audio);
        return false;
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -332,6 +332,8 @@ typedef struct {
    int steps;
    float denoising_strength;
    int upscale_tile_size;
+    float* custom_sigmas;
+    int custom_sigmas_count;
 } sd_hires_params_t;

 typedef struct {
@ -382,6 +384,7 @@ typedef struct {
    float vace_strength;
    sd_tiling_params_t vae_tiling_params;
    sd_cache_params_t cache;
+    sd_hires_params_t hires;
 } sd_vid_gen_params_t;

 typedef struct sd_ctx_t sd_ctx_t;
--- a/otherarch/sdcpp/tae.hpp
+++ b/otherarch/sdcpp/tae.hpp
@ -322,13 +322,21 @@ class TinyVideoEncoder : public UnaryBlock {
    int patch_size  = 1;

 public:
-    TinyVideoEncoder(int z_channels = 4, int patch_size = 1)
+    int t_downscale = 1;
+    TinyVideoEncoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_downscale = {true, true, false})
        : z_channels(z_channels), patch_size(patch_size) {
+        // Python equivalent: self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool))
+        t_downscale = 1;
+        for (bool downscale : time_downscale) {
+            if (downscale) {
+                t_downscale *= 2;
+            }
+        }
        int index                       = 0;
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1}));
        index++;  // nn.ReLU()
        for (int i = 0; i < num_layers; i++) {
-            int stride                      = i == num_layers - 1 ? 1 : 2;
+            int stride                      = time_downscale[i] ? 2 : 1;
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TPool(hidden, stride));
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
            for (int j = 0; j < num_blocks; j++) {
@ -375,15 +383,22 @@ class TinyVideoDecoder : public UnaryBlock {
    static const int num_layers  = 3;
    int channels[num_layers + 1] = {256, 128, 64, 64};
    int patch_size               = 1;
+    int t_upscale                = 1;

 public:
-    TinyVideoDecoder(int z_channels = 4, int patch_size = 1)
+    TinyVideoDecoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_upscale = {false, true, true})
        : z_channels(z_channels), patch_size(patch_size) {
+        t_upscale = 1;
+        for (bool upscale : time_upscale) {
+            if (upscale) {
+                t_upscale *= 2;
+            }
+        }
        int index                       = 1;  // Clamp()
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1}));
        index++;  // nn.ReLU()
        for (int i = 0; i < num_layers; i++) {
-            int stride = i == 0 ? 1 : 2;
+            int stride = time_upscale[i] ? 2 : 1;
            for (int j = 0; j < num_blocks; j++) {
                blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(channels[i], channels[i]));
            }
@ -430,8 +445,8 @@ public:
        if (patch_size > 1) {
            h = unpatchify(ctx->ggml_ctx, h, patch_size, 1);
        }
-        // shape(W, H, 3, 3 + T) => shape(W, H, 3, T)
-        h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]);
+        // shape(W, H, 3, (t_upscale - 1) + T) => shape(W, H, 3, T)
+        h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - (t_upscale - 1), h->nb[1], h->nb[2], h->nb[3], (t_upscale - 1) * h->nb[3]);
        return h;
    }
 };
@ -442,7 +457,9 @@ protected:
    SDVersion version;

 public:
-    int z_channels = 16;
+    int z_channels                   = 16;
+    std::vector<bool> time_downscale = {true, true, false};
+    std::vector<bool> time_upscale   = {false, true, true};

 public:
    TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2)
@ -451,21 +468,26 @@ public:
        if (version == VERSION_WAN2_2_TI2V) {
            z_channels = 48;
            patch      = 2;
+        } else if (sd_version_is_ltxav(version)) {
+            z_channels     = 128;
+            patch          = 4;
+            time_downscale = {true, true, true};
+            time_upscale   = {true, true, true};
        }
-        blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch));
+        blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch, time_upscale));
        if (!decode_only) {
-            blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch));
+            blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch, time_downscale));
        }
    }

    ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
        auto decoder = std::dynamic_pointer_cast<TinyVideoDecoder>(blocks["decoder"]);
-        if (sd_version_is_wan(version)) {
+        if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
            // (W, H, C, T) -> (W, H, T, C)
            z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 1, 3, 2));
        }
        auto result = decoder->forward(ctx, z);
-        if (sd_version_is_wan(version)) {
+        if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
            // (W, H, C, T) -> (W, H, T, C)
            result = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, result, 0, 1, 3, 2));
        }
@ -477,10 +499,10 @@ public:
        // (W, H, T, C) -> (W, H, C, T)
        x                  = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));
        int64_t num_frames = x->ne[3];
-        if (num_frames % 4) {
-            // pad to multiple of 4 at the end
+        if (num_frames % encoder->t_downscale) {
+            // pad to multiple of encoder->t_downscale at the end
            auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]);
-            for (int i = 0; i < 4 - num_frames % 4; i++) {
+            for (int i = 0; i < encoder->t_downscale - num_frames % encoder->t_downscale; i++) {
                x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3);
            }
        }