diff --git a/Makefile b/Makefile index 8f39e3c40..7e86c77f6 100644 --- a/Makefile +++ b/Makefile @@ -695,7 +695,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h budget.o: common/reasoning-budget.cpp common/reasoning-budget.h $(CXX) $(CXXFLAGS) -c $< -o $@ -SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp +SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_latent_upscaler.hpp ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp SDCPP_MAIN_BASENAMES := common/common.cpp common/common.h common/log.cpp common/log.h common/media_io.cpp common/media_io.cpp common/media_io.h common/resource_owners.hpp convert.cpp image_metadata.cpp main.cpp tokenizers/vocab/clip_merges.hpp tokenizers/vocab/gemma_merges.hpp tokenizers/vocab/gemma_vocab.hpp tokenizers/vocab/mistral_merges.hpp tokenizers/vocab/mistral_vocab.hpp tokenizers/vocab/qwen_merges.hpp tokenizers/vocab/t5.hpp tokenizers/vocab/umt5.hpp tokenizers/vocab/vocab.cpp version.cpp diff --git a/otherarch/sdcpp/common/common.cpp b/otherarch/sdcpp/common/common.cpp index 85c03b412..f32d0c6ff 100644 --- a/otherarch/sdcpp/common/common.cpp +++ b/otherarch/sdcpp/common/common.cpp @@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() { return 1; }; - auto on_sigmas_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { + auto parse_sigmas_arg = [&](const char* value, std::vector* target, const char* option_name) { + if (target == nullptr || value == nullptr) { return -1; } - std::string sigmas_str = argv[index]; + std::string sigmas_str = value; if (!sigmas_str.empty() && sigmas_str.front() == '[') { sigmas_str.erase(0, 1); } @@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() { sigmas_str.pop_back(); } + size_t before = target->size(); std::stringstream ss(sigmas_str); std::string item; while (std::getline(ss, item, ',')) { @@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() { item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1); if (!item.empty()) { try { - custom_sigmas.push_back(std::stof(item)); + target->push_back(std::stof(item)); } catch (const std::invalid_argument&) { - LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str()); + LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name); return -1; } catch (const std::out_of_range&) { - LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str()); + LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name); return -1; } } } - if (custom_sigmas.empty() && !sigmas_str.empty()) { - LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]); + if (target->size() == before && !sigmas_str.empty()) { + LOG_ERROR("error: could not parse any sigma values from '%s'", value); return -1; } return 1; }; + auto on_sigmas_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + return parse_sigmas_arg(argv[index], &custom_sigmas, "--sigmas"); + }; + + auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + return parse_sigmas_arg(argv[index], &hires_custom_sigmas, "--hires-sigmas"); + }; + auto on_ref_image_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() { "--sigmas", "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").", on_sigmas_arg}, + {"", + "--hires-sigmas", + "custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").", + on_hires_sigmas_arg}, {"", "--skip-layers", "layers to skip for SLG steps (default: [7,8,9])", @@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name, LOG_ERROR("%s directory is empty", label); return false; } + auto ends_with_valid_ext = [&]() { + for (const auto& ext : valid_ext) { + if (model_name.size() < ext.size()) { + continue; + } + auto suffix = model_name.substr(model_name.size() - ext.size()); + std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + std::string lower_ext = ext; + std::transform(lower_ext.begin(), lower_ext.end(), lower_ext.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + if (suffix == lower_ext) { + return true; + } + } + return false; + }; + if (model_name.empty() || model_name.find('/') != std::string::npos || model_name.find('\\') != std::string::npos || fs::path(model_name).has_root_path() || - fs::path(model_name).has_extension()) { + ends_with_valid_ext()) { LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str()); return false; } @@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str( if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) { hires_denoising_strength = hires_json["denoising_strength"]; } + if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) { + hires_custom_sigmas = hires_json["custom_sigmas"].get>(); + } if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) { hires_upscale_tile_size = hires_json["upscale_tile_size"]; } @@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) { LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]"); return false; } + if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) { + LOG_ERROR("error: hires custom sigmas must contain at least two values"); + return false; + } if (hires_upscale_tile_size < 1) { LOG_ERROR("error: hires upscale tile size must be positive"); return false; @@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() { params.vae_tiling_params = vae_tiling_params; params.cache = cache_params; - params.hires.enabled = hires_enabled; - params.hires.upscaler = resolved_hires_upscaler; - params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str(); - params.hires.scale = hires_scale; - params.hires.target_width = hires_width; - params.hires.target_height = hires_height; - params.hires.steps = hires_steps; - params.hires.denoising_strength = hires_denoising_strength; - params.hires.upscale_tile_size = hires_upscale_tile_size; + params.hires.enabled = hires_enabled; + params.hires.upscaler = resolved_hires_upscaler; + params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str(); + params.hires.scale = hires_scale; + params.hires.target_width = hires_width; + params.hires.target_height = hires_height; + params.hires.steps = hires_steps; + params.hires.denoising_strength = hires_denoising_strength; + params.hires.upscale_tile_size = hires_upscale_tile_size; + params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data(); + params.hires.custom_sigmas_count = static_cast(hires_custom_sigmas.size()); return params; } @@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() { high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str(); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); - params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); - params.lora_count = static_cast(lora_vec.size()); - params.prompt = prompt.c_str(); - params.negative_prompt = negative_prompt.c_str(); - params.clip_skip = clip_skip; - params.init_image = init_image.get(); - params.end_image = end_image.get(); - params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data(); - params.control_frames_size = static_cast(control_frame_views.size()); - params.width = get_resolved_width(); - params.height = get_resolved_height(); - params.sample_params = sample_params; - params.high_noise_sample_params = high_noise_sample_params; - params.moe_boundary = moe_boundary; - params.strength = strength; - params.seed = seed; - params.video_frames = video_frames; - params.fps = fps; - params.vace_strength = vace_strength; - params.vae_tiling_params = vae_tiling_params; - params.cache = cache_params; + params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); + params.lora_count = static_cast(lora_vec.size()); + params.prompt = prompt.c_str(); + params.negative_prompt = negative_prompt.c_str(); + params.clip_skip = clip_skip; + params.init_image = init_image.get(); + params.end_image = end_image.get(); + params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data(); + params.control_frames_size = static_cast(control_frame_views.size()); + params.width = get_resolved_width(); + params.height = get_resolved_height(); + params.sample_params = sample_params; + params.high_noise_sample_params = high_noise_sample_params; + params.moe_boundary = moe_boundary; + params.strength = strength; + params.seed = seed; + params.video_frames = video_frames; + params.fps = fps; + params.vace_strength = vace_strength; + params.vae_tiling_params = vae_tiling_params; + params.cache = cache_params; + params.hires.enabled = hires_enabled; + params.hires.upscaler = resolved_hires_upscaler; + params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str(); + params.hires.scale = hires_scale; + params.hires.target_width = hires_width; + params.hires.target_height = hires_height; + params.hires.steps = hires_steps; + params.hires.denoising_strength = hires_denoising_strength; + params.hires.upscale_tile_size = hires_upscale_tile_size; + params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data(); + params.hires.custom_sigmas_count = static_cast(hires_custom_sigmas.size()); return params; } @@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const { << ", target_height: " << hires_height << ", steps: " << hires_steps << ", denoising_strength: " << hires_denoising_strength + << ", custom_sigmas: " << vec_to_string(hires_custom_sigmas) << ", upscale_tile_size: " << hires_upscale_tile_size << " },\n" << " vae_tiling_params: { " << vae_tiling_params.enabled << ", " @@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params, {"target_height", gen_params.hires_height}, {"steps", gen_params.hires_steps}, {"denoising_strength", gen_params.hires_denoising_strength}, + {"custom_sigmas", gen_params.hires_custom_sigmas}, {"upscale_tile_size", gen_params.hires_upscale_tile_size}, }; } @@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params, parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", "; parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", "; parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", "; + if (!gen_params.hires_custom_sigmas.empty()) { + parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", "; + } } parameter_string += "Version: stable-diffusion.cpp"; parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode); diff --git a/otherarch/sdcpp/common/common.h b/otherarch/sdcpp/common/common.h index a515adf42..d526ca3a5 100644 --- a/otherarch/sdcpp/common/common.h +++ b/otherarch/sdcpp/common/common.h @@ -207,6 +207,7 @@ struct SDGenerationParams { int hires_steps = 0; float hires_denoising_strength = 0.7f; int hires_upscale_tile_size = 128; + std::vector hires_custom_sigmas; std::map lora_map; std::map high_noise_lora_map; diff --git a/otherarch/sdcpp/ltx_latent_upscaler.hpp b/otherarch/sdcpp/ltx_latent_upscaler.hpp new file mode 100644 index 000000000..93254454d --- /dev/null +++ b/otherarch/sdcpp/ltx_latent_upscaler.hpp @@ -0,0 +1,348 @@ +#ifndef __SD_LTX_LATENT_UPSCALER_HPP__ +#define __SD_LTX_LATENT_UPSCALER_HPP__ + +#include +#include +#include +#include +#include +#include +#include + +#include "common_dit.hpp" +#include "ggml_extend.hpp" +#include "ggml_graph_cut.h" +#include "model.h" +#include "util.h" + +namespace LTXVUpsampler { + constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240; + + struct LatentUpsamplerConfig { + int64_t in_channels = 128; + int64_t mid_channels = 1024; + int num_blocks_per_stage = 4; + int dims = 3; + bool spatial_upsample = true; + bool temporal_upsample = false; + bool rational_resampler = false; + }; + + static inline bool has_tensor(const String2TensorStorage& tensor_storage_map, + const std::string& name) { + return tensor_storage_map.find(name) != tensor_storage_map.end(); + } + + static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map, + const std::string& name, + int64_t fallback) { + auto it = tensor_storage_map.find(name); + if (it == tensor_storage_map.end()) { + return fallback; + } + return it->second.ne[0]; + } + + static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map, + const std::string& module_name) { + int max_block = -1; + const std::string prefix = module_name + "."; + for (const auto& pair : tensor_storage_map) { + const std::string& name = pair.first; + if (name.find(prefix) != 0) { + continue; + } + size_t begin = prefix.size(); + size_t end = name.find('.', begin); + if (end == std::string::npos) { + continue; + } + int index = atoi(name.substr(begin, end - begin).c_str()); + max_block = std::max(max_block, index); + } + return max_block + 1; + } + + static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) { + LatentUpsamplerConfig config; + config.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", config.mid_channels); + config.in_channels = get_tensor_ne0(tensor_storage_map, "final_conv.bias", config.in_channels); + int detected_blocks = count_module_blocks(tensor_storage_map, "res_blocks"); + if (detected_blocks > 0) { + config.num_blocks_per_stage = detected_blocks; + } + config.spatial_upsample = has_tensor(tensor_storage_map, "upsampler.0.weight"); + config.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight"); + return config; + } + + class VideoGroupNorm : public GGMLBlock { + protected: + int num_groups; + int64_t num_channels; + float eps; + std::string prefix; + + void init_params(ggml_context* ctx, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "") override { + SD_UNUSED(tensor_storage_map); + this->prefix = prefix; + params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels); + params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels); + } + + public: + VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f) + : num_groups(num_groups), + num_channels(num_channels), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + // LTX video latent layout is [W, H, T, C]. ggml_group_norm treats ne[2] + // as channels, so fold only H/T internally and restore the same layout. + GGML_ASSERT(x->ne[3] == num_channels); + const int64_t W = x->ne[0]; + const int64_t H = x->ne[1]; + const int64_t T = x->ne[2]; + x = ggml_ext_cont(ctx->ggml_ctx, x); + x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H * T, num_channels, 1); + x = ggml_group_norm(ctx->ggml_ctx, x, num_groups, eps); + + ggml_tensor* weight = params["weight"]; + ggml_tensor* bias = params["bias"]; + if (ctx->weight_adapter) { + weight = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, weight, prefix + "weight"); + bias = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, bias, prefix + "bias"); + } + weight = ggml_reshape_4d(ctx->ggml_ctx, weight, 1, 1, num_channels, 1); + bias = ggml_reshape_4d(ctx->ggml_ctx, bias, 1, 1, num_channels, 1); + x = ggml_mul_inplace(ctx->ggml_ctx, x, weight); + x = ggml_add_inplace(ctx->ggml_ctx, x, bias); + return ggml_reshape_4d(ctx->ggml_ctx, x, W, H, T, num_channels); + } + }; + + class ResBlock : public GGMLBlock { + public: + ResBlock(int64_t channels, int dims = 3) { + GGML_ASSERT(dims == 3); + blocks["conv1"] = std::shared_ptr(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1})); + blocks["norm1"] = std::shared_ptr(new VideoGroupNorm(32, channels)); + blocks["conv2"] = std::shared_ptr(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1})); + blocks["norm2"] = std::shared_ptr(new VideoGroupNorm(32, channels)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + + ggml_tensor* residual = x; + + x = conv1->forward(ctx, x); + x = norm1->forward(ctx, x); + x = ggml_silu_inplace(ctx->ggml_ctx, x); + x = conv2->forward(ctx, x); + x = norm2->forward(ctx, x); + x = ggml_add(ctx->ggml_ctx, x, residual); + return ggml_silu(ctx->ggml_ctx, x); + } + }; + + class PixelShuffleND : public UnaryBlock { + protected: + int upscale_factor; + + public: + explicit PixelShuffleND(int upscale_factor) + : upscale_factor(upscale_factor) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + GGML_ASSERT(upscale_factor == 2); + int64_t h = x->ne[1]; + int64_t w = x->ne[0]; + // x: [b*f, c*4, h, w] -> [b*f, c, h*2, w*2] + x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 2, 0, 1, 3)); // [b*f, h, w, c*4] + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0], x->ne[1] * x->ne[2], x->ne[3]); // [b*f, h*w, c*4] + return DiT::unpatchify(ctx->ggml_ctx, x, h, w, upscale_factor, upscale_factor, true); + } + }; + + class LatentUpsampler : public GGMLBlock { + public: + LatentUpsamplerConfig config; + + explicit LatentUpsampler(LatentUpsamplerConfig config) + : config(std::move(config)) { + GGML_ASSERT(this->config.dims == 3); + GGML_ASSERT(this->config.spatial_upsample); + GGML_ASSERT(!this->config.temporal_upsample); + GGML_ASSERT(!this->config.rational_resampler); + + blocks["initial_conv"] = std::shared_ptr(new Conv3d(this->config.in_channels, + this->config.mid_channels, + {3, 3, 3}, + {1, 1, 1}, + {1, 1, 1})); + blocks["initial_norm"] = std::shared_ptr(new VideoGroupNorm(32, this->config.mid_channels)); + for (int i = 0; i < this->config.num_blocks_per_stage; ++i) { + blocks["res_blocks." + std::to_string(i)] = std::shared_ptr(new ResBlock(this->config.mid_channels, this->config.dims)); + } + blocks["upsampler.0"] = std::shared_ptr(new Conv2d(this->config.mid_channels, + 4 * this->config.mid_channels, + {3, 3}, + {1, 1}, + {1, 1})); + blocks["upsampler.1"] = std::shared_ptr(new PixelShuffleND(2)); + for (int i = 0; i < this->config.num_blocks_per_stage; ++i) { + blocks["post_upsample_res_blocks." + std::to_string(i)] = std::shared_ptr(new ResBlock(this->config.mid_channels, this->config.dims)); + } + blocks["final_conv"] = std::shared_ptr(new Conv3d(this->config.mid_channels, + this->config.in_channels, + {3, 3, 3}, + {1, 1, 1}, + {1, 1, 1})); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + // x: [b*c, f, h, w] + // return: [b*c, f, h*2, w*2] + auto initial_conv = std::dynamic_pointer_cast(blocks["initial_conv"]); + auto initial_norm = std::dynamic_pointer_cast(blocks["initial_norm"]); + auto upsample_conv = std::dynamic_pointer_cast(blocks["upsampler.0"]); + auto pixel_shuffle = std::dynamic_pointer_cast(blocks["upsampler.1"]); + auto final_conv = std::dynamic_pointer_cast(blocks["final_conv"]); + + x = initial_conv->forward(ctx, x); + x = initial_norm->forward(ctx, x); + x = ggml_silu(ctx->ggml_ctx, x); + sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x"); + + for (int i = 0; i < config.num_blocks_per_stage; ++i) { + auto block = std::dynamic_pointer_cast(blocks["res_blocks." + std::to_string(i)]); + x = block->forward(ctx, x); + sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.res_blocks." + std::to_string(i), "x"); + } + + // rearrange(x, "b c f h w -> (b f) c h w"), + x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // [b*f, c, h, w] + x = upsample_conv->forward(ctx, x); // [b*f, c*4, h, w] + x = pixel_shuffle->forward(ctx, x); // [b*f, c, h*2, w*2] + x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // [b*c, f, h, w] + sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x"); + + for (int i = 0; i < config.num_blocks_per_stage; ++i) { + auto block = std::dynamic_pointer_cast(blocks["post_upsample_res_blocks." + std::to_string(i)]); + x = block->forward(ctx, x); + sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.post_blocks." + std::to_string(i), "x"); + } + + x = final_conv->forward(ctx, x); + sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x"); + return x; + } + }; + + struct LatentUpsamplerRunner : public GGMLRunner { + std::unique_ptr model; + + LatentUpsamplerRunner(ggml_backend_t backend, + ggml_backend_t params_backend) + : GGMLRunner(backend, params_backend) {} + + std::string get_desc() override { + return "ltx_latent_upsampler"; + } + + bool load_from_file(const std::string& file_path, int n_threads) { + LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str()); + ModelLoader model_loader; + if (!model_loader.init_from_file(file_path)) { + LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str()); + return false; + } + + const auto& tensor_storage_map = model_loader.get_tensor_storage_map(); + if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") || + !has_tensor(tensor_storage_map, "upsampler.0.weight")) { + LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors"); + return false; + } + + LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map); + if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample || + config.rational_resampler) { + LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d", + config.dims, + config.spatial_upsample, + config.temporal_upsample, + config.rational_resampler); + return false; + } + + model = std::make_unique(config); + model->init(params_ctx, tensor_storage_map, ""); + if (!alloc_params_buffer()) { + LOG_ERROR("LTX latent upsampler params buffer allocation failed"); + return false; + } + + std::map tensors; + model->get_param_tensors(tensors); + if (!model_loader.load_tensors(tensors, {}, n_threads)) { + LOG_ERROR("load LTX latent upsampler tensors failed"); + return false; + } + + LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d", + config.in_channels, + config.mid_channels, + config.num_blocks_per_stage); + return true; + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor) { + if (!model) { + return nullptr; + } + ggml_cgraph* gf = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + auto runner_ctx = get_context(); + ggml_tensor* out = model->forward(&runner_ctx, x); + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(const int n_threads, + const sd::Tensor& x) { + if (!model) { + LOG_ERROR("LTX latent upsampler is not loaded"); + return {}; + } + if (x.dim() != 4 && x.dim() != 5) { + LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld", + (long long)x.dim()); + return {}; + } + if (x.dim() == 5 && x.shape()[4] != 1) { + LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld", + (long long)x.shape()[4]); + return {}; + } + if (x.shape()[3] != model->config.in_channels) { + LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld", + model->config.in_channels, + (long long)x.shape()[3]); + return {}; + } + size_t expected_dim = static_cast(x.dim()); + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), expected_dim); + } + }; + +} // namespace LTXVUpsampler + +#endif // __SD_LTX_LATENT_UPSCALER_HPP__ diff --git a/otherarch/sdcpp/ltx_vae.hpp b/otherarch/sdcpp/ltx_vae.hpp index 8c41a51ce..751995860 100644 --- a/otherarch/sdcpp/ltx_vae.hpp +++ b/otherarch/sdcpp/ltx_vae.hpp @@ -1123,6 +1123,18 @@ namespace LTXVAE { mean = ggml_cont(ctx->ggml_ctx, mean); return processor->normalize(ctx, mean); } + + ggml_tensor* normalize_latents(GGMLRunnerContext* ctx, + ggml_tensor* x) { + auto processor = std::dynamic_pointer_cast(blocks["per_channel_statistics"]); + return processor->normalize(ctx, x); + } + + ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx, + ggml_tensor* x) { + auto processor = std::dynamic_pointer_cast(blocks["per_channel_statistics"]); + return processor->un_normalize(ctx, x); + } }; } // namespace LTXVAE @@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE { return gf; } + ggml_cgraph* build_latent_statistics_graph(const sd::Tensor& z_tensor, bool normalize) { + ggml_cgraph* gf = new_graph_custom(1024); + ggml_tensor* z = make_input(z_tensor); + + auto runner_ctx = get_context(); + ggml_tensor* out = normalize ? vae.normalize_latents(&runner_ctx, z) + : vae.un_normalize_latents(&runner_ctx, z); + ggml_build_forward_expand(gf, out); + return gf; + } + sd::Tensor _compute(const int n_threads, const sd::Tensor& z, bool decode_graph) override { @@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE { return result; } + sd::Tensor apply_latent_statistics(const int n_threads, + const sd::Tensor& z, + bool normalize) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_latent_statistics_graph(z, normalize); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), + static_cast(z.dim())); + } + + sd::Tensor normalize_latents(const int n_threads, + const sd::Tensor& z) { + return apply_latent_statistics(n_threads, z, true); + } + + sd::Tensor un_normalize_latents(const int n_threads, + const sd::Tensor& z) { + return apply_latent_statistics(n_threads, z, false); + } + int get_encoder_output_channels(int input_channels) override { SD_UNUSED(input_channels); return 256; diff --git a/otherarch/sdcpp/ltxv.hpp b/otherarch/sdcpp/ltxv.hpp index fa6c0601d..fd19595ef 100644 --- a/otherarch/sdcpp/ltxv.hpp +++ b/otherarch/sdcpp/ltxv.hpp @@ -1487,6 +1487,9 @@ namespace LTXV { ->forward(ctx, ggml_ext_scale(ctx->ggml_ctx, av_ca_audio_timestep, av_ca_factor)) .first; + sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx"); + sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax"); + for (int i = 0; i < cfg.num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["transformer_blocks." + std::to_string(i)]); auto out = block->forward(ctx, @@ -1509,6 +1512,8 @@ namespace LTXV { a_prompt_timestep_mod); vx = out.first; ax = out.second; + sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.transformer_blocks." + std::to_string(i), "vx"); + sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax"); } auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size); diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index fa90942e1..a5fbe89fe 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -17,6 +17,7 @@ #include "guidance.h" #include "lora.hpp" #include "ltx_audio_vae.h" +#include "ltx_latent_upscaler.hpp" #include "ltx_vae.hpp" #include "pmid.hpp" #include "sample-cache.h" @@ -883,7 +884,8 @@ public: auto create_tae = [&]() -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || - sd_version_is_anima(version)) { + sd_version_is_anima(version) || + sd_version_is_ltxav(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), params_backend_for(SDBackendModule::VAE), tensor_storage_map, @@ -1430,7 +1432,7 @@ public: } auto lora = std::make_shared(lora_id, backend_for(module), - params_backend_for(module), + backend_for(module), lora_path, is_high_noise ? "model.high_noise_" : "", version); @@ -2421,6 +2423,24 @@ public: return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); } + sd::Tensor normalize_ltx_video_latents(const sd::Tensor& x) { + auto ltx_vae = std::dynamic_pointer_cast(first_stage_model); + if (!ltx_vae) { + LOG_ERROR("LTX latent normalization requires LTX video VAE"); + return {}; + } + return ltx_vae->normalize_latents(n_threads, x); + } + + sd::Tensor un_normalize_ltx_video_latents(const sd::Tensor& x) { + auto ltx_vae = std::dynamic_pointer_cast(first_stage_model); + if (!ltx_vae) { + LOG_ERROR("LTX latent un-normalization requires LTX video VAE"); + return {}; + } + return ltx_vae->un_normalize_latents(n_threads, x); + } + sd::Tensor decode_ltx_audio_latent(const sd::Tensor& audio_latent) { if (audio_vae_model == nullptr || audio_latent.empty()) { return {}; @@ -2704,16 +2724,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) { } void sd_hires_params_init(sd_hires_params_t* hires_params) { - *hires_params = {}; - hires_params->enabled = false; - hires_params->upscaler = SD_HIRES_UPSCALER_LATENT; - hires_params->model_path = nullptr; - hires_params->scale = 2.0f; - hires_params->target_width = 0; - hires_params->target_height = 0; - hires_params->steps = 0; - hires_params->denoising_strength = 0.7f; - hires_params->upscale_tile_size = 128; + *hires_params = {}; + hires_params->enabled = false; + hires_params->upscaler = SD_HIRES_UPSCALER_LATENT; + hires_params->model_path = nullptr; + hires_params->scale = 2.0f; + hires_params->target_width = 0; + hires_params->target_height = 0; + hires_params->steps = 0; + hires_params->denoising_strength = 0.7f; + hires_params->upscale_tile_size = 128; + hires_params->custom_sigmas = nullptr; + hires_params->custom_sigmas_count = 0; } void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { @@ -2986,6 +3008,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { sd_vid_gen_params->moe_boundary = 0.875f; sd_vid_gen_params->vace_strength = 1.f; sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f}; + sd_vid_gen_params->hires.enabled = false; + sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT; + sd_vid_gen_params->hires.scale = 2.f; + sd_vid_gen_params->hires.target_width = 0; + sd_vid_gen_params->hires.target_height = 0; + sd_vid_gen_params->hires.steps = 0; + sd_vid_gen_params->hires.denoising_strength = 0.7f; + sd_vid_gen_params->hires.upscale_tile_size = 128; + sd_vid_gen_params->hires.custom_sigmas = nullptr; + sd_vid_gen_params->hires.custom_sigmas_count = 0; sd_cache_params_init(&sd_vid_gen_params->cache); } @@ -3235,6 +3267,7 @@ struct GenerationRequest { vace_strength = sd_vid_gen_params->vace_strength; guidance = sd_vid_gen_params->sample_params.guidance; high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance; + hires = sd_vid_gen_params->hires; resolve(sd_ctx); if (frames != requested_frames) { LOG_WARN("align video frames from %d to %d for %s", @@ -3293,6 +3326,20 @@ struct GenerationRequest { hires.enabled = false; return; } + if (hires.custom_sigmas_count < 0) { + LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas"); + hires.custom_sigmas = nullptr; + hires.custom_sigmas_count = 0; + } + if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) { + LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas"); + hires.custom_sigmas_count = 0; + } + if (hires.custom_sigmas_count == 1) { + LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas"); + hires.custom_sigmas = nullptr; + hires.custom_sigmas_count = 0; + } hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f); hires.steps = std::max(0, hires.steps); @@ -3657,6 +3704,85 @@ static sd::Tensor pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso return sd::ops::concat(video_mask_full, audio_mask, 3); } +static sd::Tensor make_ltxav_video_denoise_mask(const sd::Tensor& video_latent, float value = 1.f) { + if (video_latent.empty()) { + return {}; + } + return sd::full({video_latent.shape()[0], + video_latent.shape()[1], + video_latent.shape()[2], + 1, + 1}, + value); +} + +static sd::Tensor encode_ltxav_condition_image(sd_ctx_t* sd_ctx, + const sd::Tensor& image, + const char* name) { + if (sd_ctx == nullptr || sd_ctx->sd == nullptr || image.empty()) { + return {}; + } + auto condition_image = image.reshape({image.shape()[0], + image.shape()[1], + 1, + image.shape()[2], + image.shape()[3]}); + auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image); + if (condition_latent.empty()) { + LOG_ERROR("failed to encode LTXAV %s image", name); + } + return condition_latent; +} + +static bool apply_ltxav_condition_by_latent_index(sd::Tensor* video_latent, + sd::Tensor* video_mask, + const sd::Tensor& condition_latent, + int64_t latent_idx, + const char* name, + float conditioned_mask) { + if (video_latent == nullptr || video_mask == nullptr || video_latent->empty() || video_mask->empty()) { + return false; + } + if (condition_latent.empty() || + condition_latent.shape()[0] != video_latent->shape()[0] || + condition_latent.shape()[1] != video_latent->shape()[1] || + condition_latent.shape()[3] != video_latent->shape()[3]) { + LOG_ERROR("invalid LTXAV %s condition latent shape", name); + return false; + } + int64_t latent_frames = video_latent->shape()[2]; + int64_t condition_frames = condition_latent.shape()[2]; + if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) { + LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64, + name, + latent_idx, + condition_frames, + latent_frames); + return false; + } + + sd::ops::slice_assign(video_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent); + sd::ops::fill_slice(video_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask); + return true; +} + +static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx, + const sd::Tensor& image, + sd::Tensor* video_latent, + sd::Tensor* video_mask, + int64_t latent_idx, + const char* name, + float strength) { + auto condition_latent = encode_ltxav_condition_image(sd_ctx, image, name); + return !condition_latent.empty() && + apply_ltxav_condition_by_latent_index(video_latent, + video_mask, + condition_latent, + latent_idx, + name, + 1.0f - std::clamp(strength, 0.f, 1.f)); +} + static sd::Tensor unpack_ltxav_audio_latent(const sd::Tensor& packed_latent, int audio_length, int video_channels) { @@ -4218,6 +4344,53 @@ static sd::Tensor upscale_hires_latent(sd_ctx_t* sd_ctx, return {}; } +static std::vector make_hires_sigma_schedule(sd_ctx_t* sd_ctx, + const sd_hires_params_t& hires, + const sd_sample_params_t& sample_params, + sample_method_t sample_method, + int default_steps, + int sample_seq_len, + int* scheduler_steps_out) { + if (scheduler_steps_out != nullptr) { + *scheduler_steps_out = 0; + } + + if (hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr) { + std::vector custom_sigmas(hires.custom_sigmas, + hires.custom_sigmas + hires.custom_sigmas_count); + if (scheduler_steps_out != nullptr) { + *scheduler_steps_out = static_cast(custom_sigmas.size()) - 1; + } + return custom_sigmas; + } + + int effective_steps = hires.steps > 0 ? hires.steps : default_steps; + effective_steps = std::max(1, effective_steps); + + // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps, + // unlike img2img which trims from a fixed step count. + int scheduler_steps = static_cast(effective_steps / hires.denoising_strength); + scheduler_steps = std::max(1, scheduler_steps); + + scheduler_t scheduler = resolve_scheduler(sd_ctx, + sample_params.scheduler, + sample_method); + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(scheduler_steps, + sample_seq_len, + scheduler, + sd_ctx->sd->version, + sample_params.extra_sample_args); + size_t t_enc = static_cast(scheduler_steps * hires.denoising_strength); + if (t_enc >= static_cast(scheduler_steps)) { + t_enc = static_cast(scheduler_steps) - 1; + } + if (scheduler_steps_out != nullptr) { + *scheduler_steps_out = scheduler_steps; + } + return std::vector(sigmas.begin() + scheduler_steps - static_cast(t_enc) - 1, + sigmas.end()); +} + SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; @@ -4340,29 +4513,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s } } - int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps; - - // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps, - // unlike img2img which trims from a fixed step count - hires_steps = static_cast(hires_steps / request.hires.denoising_strength); - - std::vector hires_sigmas = sd_ctx->sd->denoiser->get_sigmas( - hires_steps, - sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width), - sd_img_gen_params->sample_params.scheduler, - sd_ctx->sd->version, - sd_img_gen_params->sample_params.extra_sample_args); - - size_t t_enc = static_cast(hires_steps * request.hires.denoising_strength); - if (t_enc >= static_cast(hires_steps)) { - t_enc = static_cast(hires_steps) - 1; - } - std::vector hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast(t_enc) - 1, - hires_sigmas.end()); - LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu", - hires_steps, + int hires_scheduler_steps = 0; + std::vector hires_sigma_sched = + make_hires_sigma_schedule(sd_ctx, + request.hires, + sd_img_gen_params->sample_params, + plan.sample_method, + plan.sample_steps, + sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width), + &hires_scheduler_steps); + LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s", + hires_scheduler_steps, request.hires.denoising_strength, - hires_sigma_sched.size()); + hires_sigma_sched.size(), + request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : ""); std::vector> hires_final_latents; int64_t hires_denoise_start = ggml_time_ms(); @@ -4510,44 +4674,7 @@ static std::optional prepare_video_generation_latents(sd float conditioning_strength = std::clamp(request->strength, 0.f, 1.f); float conditioned_mask = 1.0f - conditioning_strength; - latents.denoise_mask = sd::full({latents.init_latent.shape()[0], - latents.init_latent.shape()[1], - latents.init_latent.shape()[2], - 1, - 1}, - 1.f); - - auto encode_ltxav_condition_image = [&](const sd::Tensor& image, const char* name) -> sd::Tensor { - auto condition_image = image.reshape({image.shape()[0], - image.shape()[1], - 1, - image.shape()[2], - image.shape()[3]}); - auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image); - if (condition_latent.empty()) { - LOG_ERROR("failed to encode LTXAV %s image", name); - } - return condition_latent; - }; - - auto apply_video_condition_by_latent_index = [&](const sd::Tensor& condition_latent, - int64_t latent_idx, - const char* name) -> bool { - int64_t latent_frames = latents.init_latent.shape()[2]; - int64_t condition_frames = condition_latent.shape()[2]; - if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) { - LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64, - name, - latent_idx, - condition_frames, - latent_frames); - return false; - } - - sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent); - sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask); - return true; - }; + latents.denoise_mask = make_ltxav_video_denoise_mask(latents.init_latent, 1.f); auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor& keyframes, int frame_idx, @@ -4585,20 +4712,30 @@ static std::optional prepare_video_generation_latents(sd }; if (!start_image.empty()) { - auto start_image_latent = encode_ltxav_condition_image(start_image, "init"); - if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) { + if (!apply_ltxav_condition_image_by_latent_index(sd_ctx, + start_image, + &latents.init_latent, + &latents.denoise_mask, + 0, + "init", + conditioning_strength)) { return std::nullopt; } } if (!end_image.empty()) { - auto end_image_latent = encode_ltxav_condition_image(end_image, "end"); + auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end"); if (end_image_latent.empty()) { return std::nullopt; } int frame_idx = request->frames - 1; - bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end") + bool ok = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent, + &latents.denoise_mask, + end_image_latent, + 0, + "end", + conditioned_mask) : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end"); if (!ok) { return std::nullopt; @@ -4879,6 +5016,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx, return result_images; } +static sd::Tensor upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx, + const char* model_path, + const sd::Tensor& packed_latent, + int audio_length) { + if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) { + return {}; + } + if (strlen(SAFE_STR(model_path)) == 0) { + LOG_ERROR("LTX latent spatial upscale requires a model path"); + return {}; + } + if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) { + return {}; + } + + int latent_channels = sd_ctx->sd->get_latent_channel(); + sd::Tensor video_latent = packed_latent; + sd::Tensor audio_latent; + if (packed_latent.shape()[3] > latent_channels) { + video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels); + audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels); + } + + LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2", + (int)video_latent.shape()[0], + (int)video_latent.shape()[1], + (int)video_latent.shape()[2], + (int)video_latent.shape()[3]); + + sd::Tensor unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent); + if (unnormalized.empty()) { + LOG_ERROR("LTX latent un-normalization failed before spatial upscale"); + return {}; + } + + std::unique_ptr upsampler = + std::make_unique(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER), + sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER)); + const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram); + upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes); + if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) { + LOG_ERROR("load LTX latent upsampler failed"); + return {}; + } + + sd::Tensor upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized); + upsampler.reset(); + if (upscaled.empty()) { + LOG_ERROR("LTX latent spatial upscale failed"); + return {}; + } + + upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled); + if (upscaled.empty()) { + LOG_ERROR("LTX latent normalization failed after spatial upscale"); + return {}; + } + + if (!audio_latent.empty()) { + upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent); + } + return upscaled; +} + +static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + const GenerationRequest& request, + const ImageGenerationLatents& latents, + sd::Tensor* latent, + sd::Tensor* denoise_mask, + sd::Tensor* video_positions) { + if (sd_ctx == nullptr || sd_ctx->sd == nullptr || sd_vid_gen_params == nullptr || + latent == nullptr || latent->empty() || denoise_mask == nullptr || video_positions == nullptr) { + return true; + } + if (sd_vid_gen_params->init_image.data == nullptr && + sd_vid_gen_params->end_image.data == nullptr) { + return true; + } + if (sd_ctx->sd->vae_decode_only) { + LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false"); + return false; + } + + constexpr float conditioning_strength = 1.f; + int latent_channels = sd_ctx->sd->get_latent_channel(); + sd::Tensor video_latent = *latent; + sd::Tensor audio_latent; + if (latent->shape()[3] > latent_channels) { + video_latent = sd::ops::slice(*latent, 3, 0, latent_channels); + audio_latent = unpack_ltxav_audio_latent(*latent, latents.audio_length, latent_channels); + if (audio_latent.empty()) { + LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning"); + return false; + } + } + + int image_width = static_cast(video_latent.shape()[0]) * request.vae_scale_factor; + int image_height = static_cast(video_latent.shape()[1]) * request.vae_scale_factor; + sd::Tensor video_mask = make_ltxav_video_denoise_mask(video_latent, 1.f); + + if (sd_vid_gen_params->init_image.data != nullptr) { + sd::Tensor start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, image_width, image_height); + if (!apply_ltxav_condition_image_by_latent_index(sd_ctx, + start_image, + &video_latent, + &video_mask, + 0, + "init", + conditioning_strength)) { + return false; + } + } + + if (sd_vid_gen_params->end_image.data != nullptr) { + sd::Tensor end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, image_width, image_height); + sd::Tensor end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end"); + if (end_image_latent.empty()) { + return false; + } + + int frame_idx = request.frames - 1; + if (frame_idx == 0) { + if (!apply_ltxav_condition_by_latent_index(&video_latent, + &video_mask, + end_image_latent, + 0, + "end", + 1.f - conditioning_strength)) { + return false; + } + } else { + if (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0) { + LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata"); + return false; + } + int64_t target_latent_frames = latents.video_target_frame_count; + if (!apply_ltxav_condition_by_latent_index(&video_latent, + &video_mask, + end_image_latent, + target_latent_frames, + "end", + 1.f - conditioning_strength)) { + return false; + } + *video_positions = build_ltxv_video_positions(video_latent.shape()[0], + video_latent.shape()[1], + target_latent_frames, + end_image_latent.shape()[2], + frame_idx, + 1, + request.fps, + request.vae_scale_factor, + 8, + true); + } + } + + if (!audio_latent.empty()) { + *latent = pack_ltxav_audio_and_video_latents(video_latent, audio_latent); + *denoise_mask = pack_ltxav_audio_and_video_denoise_mask(video_mask, video_latent, audio_latent); + } else { + *latent = std::move(video_latent); + *denoise_mask = std::move(video_mask); + } + LOG_INFO("LTXV refine image conditioning applied at %dx%d", image_width, image_height); + return true; +} + SD_API bool generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, sd_image_t** frames_out, @@ -4899,6 +5205,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, int64_t t0 = ggml_time_ms(); sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; GenerationRequest request(sd_ctx, sd_vid_gen_params); + bool latent_upscale_enabled = request.hires.enabled; + GenerationRequest hires_request = request; + if (latent_upscale_enabled) { + if (!sd_version_is_ltxav(sd_ctx->sd->version)) { + LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models"); + return false; + } + if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) { + LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL"); + return false; + } + if (strlen(SAFE_STR(request.hires.model_path)) == 0) { + LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided"); + return false; + } + } + sd_ctx->sd->rng->manual_seed(request.seed); sd_ctx->sd->sampler_rng->manual_seed(request.seed); sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); @@ -4910,14 +5233,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, return false; } ImageGenerationLatents latents = std::move(*latent_inputs_opt); - ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx, - sd_vid_gen_params, - request, - latents); - LOG_INFO("generate_video %dx%dx%d", - request.width, - request.height, - request.frames); + + ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx, + sd_vid_gen_params, + request, + latents); + if (latent_upscale_enabled) { + LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale", + request.width, + request.height, + request.frames); + } else { + LOG_INFO("generate_video %dx%dx%d", + request.width, + request.height, + request.frames); + } int64_t latent_start = ggml_time_ms(); int W = request.width / request.vae_scale_factor; @@ -5009,15 +5340,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, latents.video_positions); int64_t sampling_end = ggml_time_ms(); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } if (final_latent.empty()) { + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); return false; } LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (latent_upscale_enabled) { + int64_t upscale_start = ggml_time_ms(); + sd::Tensor upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx, + request.hires.model_path, + final_latent, + latents.audio_length); + int64_t upscale_end = ggml_time_ms(); + if (upscaled_latent.empty()) { + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return false; + } + LOG_INFO("LTX latent spatial upscale completed, taking %.2fs", + (upscale_end - upscale_start) * 1.0f / 1000); + + x_t = std::move(upscaled_latent); + hires_request.width = static_cast(x_t.shape()[0]) * hires_request.vae_scale_factor; + hires_request.height = static_cast(x_t.shape()[1]) * hires_request.vae_scale_factor; + if ((request.hires.target_width > 0 || request.hires.target_height > 0) && + (request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) { + LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d", + hires_request.width, + hires_request.height, + request.hires.target_width, + request.hires.target_height); + } + sd::Tensor hires_denoise_mask; + sd::Tensor hires_video_positions; + if (!apply_ltxv_refine_image_conditioning(sd_ctx, + sd_vid_gen_params, + hires_request, + latents, + &x_t, + &hires_denoise_mask, + &hires_video_positions)) { + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return false; + } + noise = sd::Tensor::randn_like(x_t, sd_ctx->sd->rng); + + W = hires_request.width / hires_request.vae_scale_factor; + H = hires_request.height / hires_request.vae_scale_factor; + T = static_cast(x_t.shape()[2]); + sample_method_t hires_sample_method = plan.sample_method; + int hires_scheduler_steps = 0; + std::vector hires_sigma_sched = + make_hires_sigma_schedule(sd_ctx, + request.hires, + sd_vid_gen_params->sample_params, + hires_sample_method, + plan.sample_steps, + sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T, + &hires_scheduler_steps); + float hires_eta = resolve_eta(sd_ctx, + sd_vid_gen_params->sample_params.eta, + hires_sample_method); + + LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T); + LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s", + hires_scheduler_steps, + request.hires.denoising_strength, + sampling_methods_str[hires_sample_method], + hires_sigma_sched.size(), + request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : ""); + + sampling_start = ggml_time_ms(); + final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + x_t, + std::move(noise), + embeds.cond, + hires_request.use_uncond ? embeds.uncond : SDCondition(), + embeds.img_cond, + embeds.id_cond, + sd::Tensor(), + 0.f, + sd_vid_gen_params->sample_params.guidance, + hires_eta, + sd_vid_gen_params->sample_params.shifted_timestep, + hires_sample_method, + sd_ctx->sd->is_flow_denoiser(), + plan.extra_sample_args, + hires_sigma_sched, + -1, + std::vector>{}, + false, + hires_denoise_mask, + sd::Tensor(), + hires_request.vace_strength, + latents.audio_length, + static_cast(hires_request.fps), + hires_request.cache_params, + hires_video_positions); + sampling_end = ggml_time_ms(); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + if (final_latent.empty()) { + LOG_ERROR("sampling(latent upscale) failed after %.2fs", + (sampling_end - sampling_start) * 1.0f / 1000); + return false; + } + LOG_INFO("sampling(latent upscale) completed, taking %.2fs", + (sampling_end - sampling_start) * 1.0f / 1000); + } else if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + sd_audio_t* generated_audio = nullptr; if (sd_version_is_ltxav(sd_ctx->sd->version) && latents.audio_length > 0 && @@ -5048,7 +5490,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, int64_t latent_end = ggml_time_ms(); LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000); - auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out); + auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out); if (result == nullptr) { free_sd_audio(generated_audio); return false; diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h index 3d5b3fd18..3ae44addf 100644 --- a/otherarch/sdcpp/stable-diffusion.h +++ b/otherarch/sdcpp/stable-diffusion.h @@ -332,6 +332,8 @@ typedef struct { int steps; float denoising_strength; int upscale_tile_size; + float* custom_sigmas; + int custom_sigmas_count; } sd_hires_params_t; typedef struct { @@ -382,6 +384,7 @@ typedef struct { float vace_strength; sd_tiling_params_t vae_tiling_params; sd_cache_params_t cache; + sd_hires_params_t hires; } sd_vid_gen_params_t; typedef struct sd_ctx_t sd_ctx_t; diff --git a/otherarch/sdcpp/tae.hpp b/otherarch/sdcpp/tae.hpp index 823cff2d6..c7217ceb1 100644 --- a/otherarch/sdcpp/tae.hpp +++ b/otherarch/sdcpp/tae.hpp @@ -322,13 +322,21 @@ class TinyVideoEncoder : public UnaryBlock { int patch_size = 1; public: - TinyVideoEncoder(int z_channels = 4, int patch_size = 1) + int t_downscale = 1; + TinyVideoEncoder(int z_channels = 4, int patch_size = 1, std::vector time_downscale = {true, true, false}) : z_channels(z_channels), patch_size(patch_size) { + // self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool)) + t_downscale = 1; + for (bool downscale : time_downscale) { + if (downscale) { + t_downscale *= 2; + } + } int index = 0; blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1})); index++; // nn.ReLU() for (int i = 0; i < num_layers; i++) { - int stride = i == num_layers - 1 ? 1 : 2; + int stride = time_downscale[i] ? 2 : 1; blocks[std::to_string(index++)] = std::shared_ptr(new TPool(hidden, stride)); blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int j = 0; j < num_blocks; j++) { @@ -375,15 +383,22 @@ class TinyVideoDecoder : public UnaryBlock { static const int num_layers = 3; int channels[num_layers + 1] = {256, 128, 64, 64}; int patch_size = 1; + int t_upscale = 1; public: - TinyVideoDecoder(int z_channels = 4, int patch_size = 1) + TinyVideoDecoder(int z_channels = 4, int patch_size = 1, std::vector time_upscale = {false, true, true}) : z_channels(z_channels), patch_size(patch_size) { + t_upscale = 1; + for (bool upscale : time_upscale) { + if (upscale) { + t_upscale *= 2; + } + } int index = 1; // Clamp() blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1})); index++; // nn.ReLU() for (int i = 0; i < num_layers; i++) { - int stride = i == 0 ? 1 : 2; + int stride = time_upscale[i] ? 2 : 1; for (int j = 0; j < num_blocks; j++) { blocks[std::to_string(index++)] = std::shared_ptr(new MemBlock(channels[i], channels[i])); } @@ -430,8 +445,8 @@ public: if (patch_size > 1) { h = unpatchify(ctx->ggml_ctx, h, patch_size, 1); } - // shape(W, H, 3, 3 + T) => shape(W, H, 3, T) - h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]); + // shape(W, H, 3, (t_upscale - 1) + T) => shape(W, H, 3, T) + h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - (t_upscale - 1), h->nb[1], h->nb[2], h->nb[3], (t_upscale - 1) * h->nb[3]); return h; } }; @@ -442,7 +457,9 @@ protected: SDVersion version; public: - int z_channels = 16; + int z_channels = 16; + std::vector time_downscale = {true, true, false}; + std::vector time_upscale = {false, true, true}; public: TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2) @@ -451,21 +468,26 @@ public: if (version == VERSION_WAN2_2_TI2V) { z_channels = 48; patch = 2; + } else if (sd_version_is_ltxav(version)) { + z_channels = 128; + patch = 4; + time_downscale = {true, true, true}; + time_upscale = {true, true, true}; } - blocks["decoder"] = std::shared_ptr(new TinyVideoDecoder(z_channels, patch)); + blocks["decoder"] = std::shared_ptr(new TinyVideoDecoder(z_channels, patch, time_upscale)); if (!decode_only) { - blocks["encoder"] = std::shared_ptr(new TinyVideoEncoder(z_channels, patch)); + blocks["encoder"] = std::shared_ptr(new TinyVideoEncoder(z_channels, patch, time_downscale)); } } ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); - if (sd_version_is_wan(version)) { + if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) { // (W, H, C, T) -> (W, H, T, C) z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 1, 3, 2)); } auto result = decoder->forward(ctx, z); - if (sd_version_is_wan(version)) { + if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) { // (W, H, C, T) -> (W, H, T, C) result = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, result, 0, 1, 3, 2)); } @@ -477,10 +499,10 @@ public: // (W, H, T, C) -> (W, H, C, T) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); int64_t num_frames = x->ne[3]; - if (num_frames % 4) { - // pad to multiple of 4 at the end + if (num_frames % encoder->t_downscale) { + // pad to multiple of encoder->t_downscale at the end auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]); - for (int i = 0; i < 4 - num_frames % 4; i++) { + for (int i = 0; i < encoder->t_downscale - num_frames % encoder->t_downscale; i++) { x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3); } }