sd: sync to master-637-ef92a00

This commit is contained in:
Wagner Bruna 2026-05-20 22:42:01 -03:00
parent 627e317cd7
commit f27795cef0
9 changed files with 1070 additions and 142 deletions

View file

@ -695,7 +695,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h
budget.o: common/reasoning-budget.cpp common/reasoning-budget.h
$(CXX) $(CXXFLAGS) -c $< -o $@
SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp
SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_latent_upscaler.hpp ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp
SDCPP_MAIN_BASENAMES := common/common.cpp common/common.h common/log.cpp common/log.h common/media_io.cpp common/media_io.cpp common/media_io.h common/resource_owners.hpp convert.cpp image_metadata.cpp main.cpp tokenizers/vocab/clip_merges.hpp tokenizers/vocab/gemma_merges.hpp tokenizers/vocab/gemma_vocab.hpp tokenizers/vocab/mistral_merges.hpp tokenizers/vocab/mistral_vocab.hpp tokenizers/vocab/qwen_merges.hpp tokenizers/vocab/t5.hpp tokenizers/vocab/umt5.hpp tokenizers/vocab/vocab.cpp version.cpp

View file

@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() {
return 1;
};
auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
auto parse_sigmas_arg = [&](const char* value, std::vector<float>* target, const char* option_name) {
if (target == nullptr || value == nullptr) {
return -1;
}
std::string sigmas_str = argv[index];
std::string sigmas_str = value;
if (!sigmas_str.empty() && sigmas_str.front() == '[') {
sigmas_str.erase(0, 1);
}
@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() {
sigmas_str.pop_back();
}
size_t before = target->size();
std::stringstream ss(sigmas_str);
std::string item;
while (std::getline(ss, item, ',')) {
@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() {
item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);
if (!item.empty()) {
try {
custom_sigmas.push_back(std::stof(item));
target->push_back(std::stof(item));
} catch (const std::invalid_argument&) {
LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name);
return -1;
} catch (const std::out_of_range&) {
LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name);
return -1;
}
}
}
if (custom_sigmas.empty() && !sigmas_str.empty()) {
LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]);
if (target->size() == before && !sigmas_str.empty()) {
LOG_ERROR("error: could not parse any sigma values from '%s'", value);
return -1;
}
return 1;
};
// Handler for "--sigmas": forwards the token following the option to
// parse_sigmas_arg, which appends the parsed values to custom_sigmas.
// Returns -1 when no token follows the option.
auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
    const int value_index = index + 1;
    if (value_index >= argc) {
        return -1;
    }
    return parse_sigmas_arg(argv[value_index], &custom_sigmas, "--sigmas");
};
// Handler for "--hires-sigmas": forwards the token following the option to
// parse_sigmas_arg, which appends the parsed values to hires_custom_sigmas.
// Returns -1 when no token follows the option.
auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) {
    const int value_index = index + 1;
    if (value_index >= argc) {
        return -1;
    }
    return parse_sigmas_arg(argv[value_index], &hires_custom_sigmas, "--hires-sigmas");
};
auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() {
"--sigmas",
"custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
on_sigmas_arg},
{"",
"--hires-sigmas",
"custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").",
on_hires_sigmas_arg},
{"",
"--skip-layers",
"layers to skip for SLG steps (default: [7,8,9])",
@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name,
LOG_ERROR("%s directory is empty", label);
return false;
}
// Reports whether model_name ends (case-insensitively) with any entry of
// valid_ext. Entries longer than model_name can never match and are skipped.
auto ends_with_valid_ext = [&]() {
    // ASCII-lowercases a copy of the given text.
    const auto lowered = [](std::string text) {
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) {
            return static_cast<char>(std::tolower(c));
        });
        return text;
    };
    for (const auto& ext : valid_ext) {
        if (ext.size() > model_name.size()) {
            continue;
        }
        const std::string tail = lowered(model_name.substr(model_name.size() - ext.size()));
        if (tail == lowered(ext)) {
            return true;
        }
    }
    return false;
};
if (model_name.empty() ||
model_name.find('/') != std::string::npos ||
model_name.find('\\') != std::string::npos ||
fs::path(model_name).has_root_path() ||
fs::path(model_name).has_extension()) {
ends_with_valid_ext()) {
LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
return false;
}
@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str(
if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
hires_denoising_strength = hires_json["denoising_strength"];
}
if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) {
hires_custom_sigmas = hires_json["custom_sigmas"].get<std::vector<float>>();
}
if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
hires_upscale_tile_size = hires_json["upscale_tile_size"];
}
@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) {
LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
return false;
}
if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
LOG_ERROR("error: hires custom sigmas must contain at least two values");
return false;
}
if (hires_upscale_tile_size < 1) {
LOG_ERROR("error: hires upscale tile size must be positive");
return false;
@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale;
params.hires.target_width = hires_width;
params.hires.target_height = hires_height;
params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size;
params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale;
params.hires.target_width = hires_width;
params.hires.target_height = hires_height;
params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size;
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
return params;
}
@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
params.lora_count = static_cast<uint32_t>(lora_vec.size());
params.prompt = prompt.c_str();
params.negative_prompt = negative_prompt.c_str();
params.clip_skip = clip_skip;
params.init_image = init_image.get();
params.end_image = end_image.get();
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
params.control_frames_size = static_cast<int>(control_frame_views.size());
params.width = get_resolved_width();
params.height = get_resolved_height();
params.sample_params = sample_params;
params.high_noise_sample_params = high_noise_sample_params;
params.moe_boundary = moe_boundary;
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = fps;
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
params.lora_count = static_cast<uint32_t>(lora_vec.size());
params.prompt = prompt.c_str();
params.negative_prompt = negative_prompt.c_str();
params.clip_skip = clip_skip;
params.init_image = init_image.get();
params.end_image = end_image.get();
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
params.control_frames_size = static_cast<int>(control_frame_views.size());
params.width = get_resolved_width();
params.height = get_resolved_height();
params.sample_params = sample_params;
params.high_noise_sample_params = high_noise_sample_params;
params.moe_boundary = moe_boundary;
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = fps;
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
params.hires.enabled = hires_enabled;
params.hires.upscaler = resolved_hires_upscaler;
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
params.hires.scale = hires_scale;
params.hires.target_width = hires_width;
params.hires.target_height = hires_height;
params.hires.steps = hires_steps;
params.hires.denoising_strength = hires_denoising_strength;
params.hires.upscale_tile_size = hires_upscale_tile_size;
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
return params;
}
@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const {
<< ", target_height: " << hires_height
<< ", steps: " << hires_steps
<< ", denoising_strength: " << hires_denoising_strength
<< ", custom_sigmas: " << vec_to_string(hires_custom_sigmas)
<< ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
<< " vae_tiling_params: { "
<< vae_tiling_params.enabled << ", "
@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
{"target_height", gen_params.hires_height},
{"steps", gen_params.hires_steps},
{"denoising_strength", gen_params.hires_denoising_strength},
{"custom_sigmas", gen_params.hires_custom_sigmas},
{"upscale_tile_size", gen_params.hires_upscale_tile_size},
};
}
@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
if (!gen_params.hires_custom_sigmas.empty()) {
parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", ";
}
}
parameter_string += "Version: stable-diffusion.cpp";
parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);

View file

@ -207,6 +207,7 @@ struct SDGenerationParams {
int hires_steps = 0;
float hires_denoising_strength = 0.7f;
int hires_upscale_tile_size = 128;
std::vector<float> hires_custom_sigmas;
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;

View file

@ -0,0 +1,348 @@
#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
#define __SD_LTX_LATENT_UPSCALER_HPP__
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "common_dit.hpp"
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "model.h"
#include "util.h"
namespace LTXVUpsampler {
constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
// Shape and feature switches for the LTX latent upsampler network.
struct LatentUpsamplerConfig {
    // Channel count of the latent fed to the first conv and produced by the last conv.
    int64_t in_channels{128};
    // Channel count used by every layer between the first and last conv.
    int64_t mid_channels{1024};
    // Number of residual blocks before the upsample step, and again after it.
    int num_blocks_per_stage{4};
    // Convolution dimensionality; LatentUpsampler asserts this is 3.
    int dims{3};
    // LatentUpsampler asserts spatial upsampling is enabled ...
    bool spatial_upsample{true};
    // ... and that temporal upsampling and the rational resampler are disabled.
    bool temporal_upsample{false};
    bool rational_resampler{false};
};
// Returns true when `name` is a key of `tensor_storage_map`.
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
                              const std::string& name) {
    const auto found = tensor_storage_map.find(name);
    return found != tensor_storage_map.end();
}
// Returns ne[0] of the tensor stored under `name`, or `fallback` when the map
// has no such entry.
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
                                     const std::string& name,
                                     int64_t fallback) {
    const auto entry = tensor_storage_map.find(name);
    if (entry != tensor_storage_map.end()) {
        return entry->second.ne[0];
    }
    return fallback;
}
static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
const std::string& module_name) {
int max_block = -1;
const std::string prefix = module_name + ".";
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (name.find(prefix) != 0) {
continue;
}
size_t begin = prefix.size();
size_t end = name.find('.', begin);
if (end == std::string::npos) {
continue;
}
int index = atoi(name.substr(begin, end - begin).c_str());
max_block = std::max(max_block, index);
}
return max_block + 1;
}
// Derives an upsampler configuration from the tensors present in a checkpoint.
//
// Channel counts come from ne[0] of "initial_norm.weight" (mid) and
// "final_conv.bias" (in); the block count comes from the "res_blocks.<i>"
// names. Each of these falls back to the LatentUpsamplerConfig default when the
// tensor is absent. The two upsample flags reflect whether the corresponding
// "...upsampler.0.weight" tensor exists.
static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
    LatentUpsamplerConfig detected;

    detected.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", detected.mid_channels);
    detected.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", detected.in_channels);

    const int res_block_count = count_module_blocks(tensor_storage_map, "res_blocks");
    if (res_block_count >= 1) {
        detected.num_blocks_per_stage = res_block_count;
    }

    detected.spatial_upsample  = has_tensor(tensor_storage_map, "upsampler.0.weight");
    detected.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight");
    return detected;
}
class VideoGroupNorm : public GGMLBlock {
protected:
int num_groups;
int64_t num_channels;
float eps;
std::string prefix;
void init_params(ggml_context* ctx,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "") override {
SD_UNUSED(tensor_storage_map);
this->prefix = prefix;
params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
}
public:
VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f)
: num_groups(num_groups),
num_channels(num_channels),
eps(eps) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// LTX video latent layout is [W, H, T, C]. ggml_group_norm treats ne[2]
// as channels, so fold only H/T internally and restore the same layout.
GGML_ASSERT(x->ne[3] == num_channels);
const int64_t W = x->ne[0];
const int64_t H = x->ne[1];
const int64_t T = x->ne[2];
x = ggml_ext_cont(ctx->ggml_ctx, x);
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H * T, num_channels, 1);
x = ggml_group_norm(ctx->ggml_ctx, x, num_groups, eps);
ggml_tensor* weight = params["weight"];
ggml_tensor* bias = params["bias"];
if (ctx->weight_adapter) {
weight = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, weight, prefix + "weight");
bias = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, bias, prefix + "bias");
}
weight = ggml_reshape_4d(ctx->ggml_ctx, weight, 1, 1, num_channels, 1);
bias = ggml_reshape_4d(ctx->ggml_ctx, bias, 1, 1, num_channels, 1);
x = ggml_mul_inplace(ctx->ggml_ctx, x, weight);
x = ggml_add_inplace(ctx->ggml_ctx, x, bias);
return ggml_reshape_4d(ctx->ggml_ctx, x, W, H, T, num_channels);
}
};
// Residual block: conv -> group norm -> SiLU -> conv -> group norm, added to
// the block input and passed through a final SiLU. Only dims == 3 is accepted.
class ResBlock : public GGMLBlock {
public:
    ResBlock(int64_t channels, int dims = 3) {
        GGML_ASSERT(dims == 3);
        // Both convolutions are 3x3x3, stride 1, padding 1; both norms use 32 groups.
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));
    }

    // Builds the block's graph on top of `x` and returns the output node.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        const auto first_conv  = std::dynamic_pointer_cast<Conv3d>(blocks["conv1"]);
        const auto first_norm  = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm1"]);
        const auto second_conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv2"]);
        const auto second_norm = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm2"]);

        ggml_tensor* h = first_conv->forward(ctx, x);
        h              = first_norm->forward(ctx, h);
        h              = ggml_silu_inplace(ctx->ggml_ctx, h);
        h              = second_conv->forward(ctx, h);
        h              = second_norm->forward(ctx, h);

        // Skip connection: `x` is still the untouched block input here.
        h = ggml_add(ctx->ggml_ctx, h, x);
        return ggml_silu(ctx->ggml_ctx, h);
    }
};
// Pixel-shuffle step of the spatial upsampler. Only a factor of 2 is
// supported; forward() asserts this.
class PixelShuffleND : public UnaryBlock {
protected:
    int upscale_factor;

public:
    explicit PixelShuffleND(int upscale_factor)
        : upscale_factor(upscale_factor) {}

    // Shape notes use the original author's torch-order notation:
    // input [b*f, c*4, h, w] -> output [b*f, c, h*2, w*2].
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
        GGML_ASSERT(upscale_factor == 2);
        const int64_t h = x->ne[1];
        const int64_t w = x->ne[0];

        // Move channels last: [b*f, h, w, c*4]
        ggml_tensor* channels_last = ggml_ext_cont(ctx->ggml_ctx,
                                                   ggml_ext_torch_permute(ctx->ggml_ctx, x, 2, 0, 1, 3));
        // Merge the two spatial axes into one token axis: [b*f, h*w, c*4]
        ggml_tensor* tokens = ggml_reshape_3d(ctx->ggml_ctx,
                                              channels_last,
                                              channels_last->ne[0],
                                              channels_last->ne[1] * channels_last->ne[2],
                                              channels_last->ne[3]);
        return DiT::unpatchify(ctx->ggml_ctx, tokens, h, w, upscale_factor, upscale_factor, true);
    }
};
// Spatial 2x latent upsampler network:
// initial conv + norm + SiLU, a stage of residual blocks, a Conv2d followed by
// pixel shuffle, a second stage of residual blocks, and a final conv back to
// the input channel count.
class LatentUpsampler : public GGMLBlock {
public:
    LatentUpsamplerConfig config;

    // Registers every sub-block. Only the 3-D, spatial-only, non-rational
    // configuration is supported; anything else trips an assertion.
    explicit LatentUpsampler(LatentUpsamplerConfig config)
        : config(std::move(config)) {
        const LatentUpsamplerConfig& cfg = this->config;
        GGML_ASSERT(cfg.dims == 3);
        GGML_ASSERT(cfg.spatial_upsample);
        GGML_ASSERT(!cfg.temporal_upsample);
        GGML_ASSERT(!cfg.rational_resampler);

        blocks["initial_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(cfg.in_channels,
                                                                       cfg.mid_channels,
                                                                       {3, 3, 3},
                                                                       {1, 1, 1},
                                                                       {1, 1, 1}));
        blocks["initial_norm"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, cfg.mid_channels));

        for (int i = 0; i < cfg.num_blocks_per_stage; ++i) {
            blocks["res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(cfg.mid_channels, cfg.dims));
        }

        // The Conv2d quadruples the channels; the pixel shuffle then consumes them.
        blocks["upsampler.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(cfg.mid_channels,
                                                                      4 * cfg.mid_channels,
                                                                      {3, 3},
                                                                      {1, 1},
                                                                      {1, 1}));
        blocks["upsampler.1"] = std::shared_ptr<GGMLBlock>(new PixelShuffleND(2));

        for (int i = 0; i < cfg.num_blocks_per_stage; ++i) {
            blocks["post_upsample_res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(cfg.mid_channels, cfg.dims));
        }

        blocks["final_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(cfg.mid_channels,
                                                                     cfg.in_channels,
                                                                     {3, 3, 3},
                                                                     {1, 1, 1},
                                                                     {1, 1, 1}));
    }

    // Builds the upsampler graph on top of `x`, marking a graph cut after each
    // stage. Shape notes (original author's notation):
    //   x:      [b*c, f, h, w]
    //   return: [b*c, f, h*2, w*2]
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto initial_conv  = std::dynamic_pointer_cast<Conv3d>(blocks["initial_conv"]);
        auto initial_norm  = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["initial_norm"]);
        auto upsample_conv = std::dynamic_pointer_cast<Conv2d>(blocks["upsampler.0"]);
        auto pixel_shuffle = std::dynamic_pointer_cast<PixelShuffleND>(blocks["upsampler.1"]);
        auto final_conv    = std::dynamic_pointer_cast<Conv3d>(blocks["final_conv"]);

        // Runs one stage of residual blocks, marking a cut after each block.
        const auto run_stage = [&](const std::string& block_prefix,
                                   const std::string& cut_prefix,
                                   ggml_tensor* t) {
            for (int i = 0; i < config.num_blocks_per_stage; ++i) {
                auto block = std::dynamic_pointer_cast<ResBlock>(blocks[block_prefix + std::to_string(i)]);
                t          = block->forward(ctx, t);
                sd::ggml_graph_cut::mark_graph_cut(t, cut_prefix + std::to_string(i), "x");
            }
            return t;
        };

        x = initial_conv->forward(ctx, x);
        x = initial_norm->forward(ctx, x);
        x = ggml_silu(ctx->ggml_ctx, x);
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x");

        x = run_stage("res_blocks.", "ltx_latent_upsampler.res_blocks.", x);

        // rearrange(x, "b c f h w -> (b f) c h w") so the Conv2d sees one frame per batch entry
        x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // [b*f, c, h, w]
        x = upsample_conv->forward(ctx, x);                                                      // [b*f, c*4, h, w]
        x = pixel_shuffle->forward(ctx, x);                                                      // [b*f, c, h*2, w*2]
        // Same permutation again to return to the layout used by the 3-D blocks.
        x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // [b*c, f, h, w]
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x");

        x = run_stage("post_upsample_res_blocks.", "ltx_latent_upsampler.post_blocks.", x);

        x = final_conv->forward(ctx, x);
        sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
        return x;
    }
};
struct LatentUpsamplerRunner : public GGMLRunner {
std::unique_ptr<LatentUpsampler> model;
LatentUpsamplerRunner(ggml_backend_t backend,
ggml_backend_t params_backend)
: GGMLRunner(backend, params_backend) {}
std::string get_desc() override {
return "ltx_latent_upsampler";
}
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
return false;
}
const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
!has_tensor(tensor_storage_map, "upsampler.0.weight")) {
LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors");
return false;
}
LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample ||
config.rational_resampler) {
LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d",
config.dims,
config.spatial_upsample,
config.temporal_upsample,
config.rational_resampler);
return false;
}
model = std::make_unique<LatentUpsampler>(config);
model->init(params_ctx, tensor_storage_map, "");
if (!alloc_params_buffer()) {
LOG_ERROR("LTX latent upsampler params buffer allocation failed");
return false;
}
std::map<std::string, ggml_tensor*> tensors;
model->get_param_tensors(tensors);
if (!model_loader.load_tensors(tensors, {}, n_threads)) {
LOG_ERROR("load LTX latent upsampler tensors failed");
return false;
}
LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d",
config.in_channels,
config.mid_channels,
config.num_blocks_per_stage);
return true;
}
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
if (!model) {
return nullptr;
}
ggml_cgraph* gf = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE);
ggml_tensor* x = make_input(x_tensor);
auto runner_ctx = get_context();
ggml_tensor* out = model->forward(&runner_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> compute(const int n_threads,
const sd::Tensor<float>& x) {
if (!model) {
LOG_ERROR("LTX latent upsampler is not loaded");
return {};
}
if (x.dim() != 4 && x.dim() != 5) {
LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld",
(long long)x.dim());
return {};
}
if (x.dim() == 5 && x.shape()[4] != 1) {
LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld",
(long long)x.shape()[4]);
return {};
}
if (x.shape()[3] != model->config.in_channels) {
LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
model->config.in_channels,
(long long)x.shape()[3]);
return {};
}
size_t expected_dim = static_cast<size_t>(x.dim());
auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
}
};
} // namespace LTXVUpsampler
#endif // __SD_LTX_LATENT_UPSCALER_HPP__

View file

@ -1123,6 +1123,18 @@ namespace LTXVAE {
mean = ggml_cont(ctx->ggml_ctx, mean);
return processor->normalize(ctx, mean);
}
// Appends the "per_channel_statistics" block's normalize() op to the graph
// for `x` and returns the resulting node.
ggml_tensor* normalize_latents(GGMLRunnerContext* ctx,
                               ggml_tensor* x) {
    const auto stats = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    return stats->normalize(ctx, x);
}
// Appends the "per_channel_statistics" block's un_normalize() op to the graph
// for `x` and returns the resulting node.
ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx,
                                  ggml_tensor* x) {
    const auto stats = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    return stats->un_normalize(ctx, x);
}
};
} // namespace LTXVAE
@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE {
return gf;
}
// Builds a small graph that applies the VAE's latent statistics to `z_tensor`:
// normalize when `normalize` is true, un-normalize otherwise.
ggml_cgraph* build_latent_statistics_graph(const sd::Tensor<float>& z_tensor, bool normalize) {
    ggml_cgraph* graph   = new_graph_custom(1024);
    ggml_tensor* latents = make_input(z_tensor);
    auto runner_ctx      = get_context();

    ggml_tensor* result = nullptr;
    if (normalize) {
        result = vae.normalize_latents(&runner_ctx, latents);
    } else {
        result = vae.un_normalize_latents(&runner_ctx, latents);
    }

    ggml_build_forward_expand(graph, result);
    return graph;
}
sd::Tensor<float> _compute(const int n_threads,
const sd::Tensor<float>& z,
bool decode_graph) override {
@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE {
return result;
}
// Runs the latent-statistics graph on `z` (normalize or un-normalize, chosen
// by `normalize`) and passes the result through
// restore_trailing_singleton_dims with the rank of `z`.
sd::Tensor<float> apply_latent_statistics(const int n_threads,
                                          const sd::Tensor<float>& z,
                                          bool normalize) {
    const size_t input_rank = static_cast<size_t>(z.dim());
    auto graph_builder      = [&]() -> ggml_cgraph* {
        return build_latent_statistics_graph(z, normalize);
    };
    return restore_trailing_singleton_dims(GGMLRunner::compute<float>(graph_builder, n_threads, false),
                                           input_rank);
}
// Runs the latent-statistics graph on `z` in normalize mode.
sd::Tensor<float> normalize_latents(const int n_threads,
                                    const sd::Tensor<float>& z) {
    const bool normalize = true;
    return apply_latent_statistics(n_threads, z, normalize);
}
// Runs the latent-statistics graph on `z` in un-normalize mode.
sd::Tensor<float> un_normalize_latents(const int n_threads,
                                       const sd::Tensor<float>& z) {
    const bool normalize = false;
    return apply_latent_statistics(n_threads, z, normalize);
}
int get_encoder_output_channels(int input_channels) override {
SD_UNUSED(input_channels);
return 256;

View file

@ -1487,6 +1487,9 @@ namespace LTXV {
->forward(ctx, ggml_ext_scale(ctx->ggml_ctx, av_ca_audio_timestep, av_ca_factor))
.first;
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx");
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax");
for (int i = 0; i < cfg.num_layers; i++) {
auto block = std::dynamic_pointer_cast<BasicAVTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto out = block->forward(ctx,
@ -1509,6 +1512,8 @@ namespace LTXV {
a_prompt_timestep_mod);
vx = out.first;
ax = out.second;
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.transformer_blocks." + std::to_string(i), "vx");
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax");
}
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size);

View file

@ -17,6 +17,7 @@
#include "guidance.h"
#include "lora.hpp"
#include "ltx_audio_vae.h"
#include "ltx_latent_upscaler.hpp"
#include "ltx_vae.hpp"
#include "pmid.hpp"
#include "sample-cache.h"
@ -883,7 +884,8 @@ public:
auto create_tae = [&]() -> std::shared_ptr<VAE> {
if (sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version)) {
sd_version_is_anima(version) ||
sd_version_is_ltxav(version)) {
return std::make_shared<TinyVideoAutoEncoder>(backend_for(SDBackendModule::VAE),
params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
@ -1430,7 +1432,7 @@ public:
}
auto lora = std::make_shared<LoraModel>(lora_id,
backend_for(module),
params_backend_for(module),
backend_for(module),
lora_path,
is_high_noise ? "model.high_noise_" : "",
version);
@ -2421,6 +2423,24 @@ public:
return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
}
// Normalizes `x` through the first-stage model when that model is an
// LTXVideoVAE; otherwise logs an error and returns an empty tensor.
sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto ltx_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return ltx_vae->normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent normalization requires LTX video VAE");
    return {};
}
// Un-normalizes `x` through the first-stage model when that model is an
// LTXVideoVAE; otherwise logs an error and returns an empty tensor.
sd::Tensor<float> un_normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto ltx_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return ltx_vae->un_normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent un-normalization requires LTX video VAE");
    return {};
}
sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) {
if (audio_vae_model == nullptr || audio_latent.empty()) {
return {};
@ -2704,16 +2724,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
}
// Resets *hires_params to the default hires-fix configuration:
// disabled, latent upscaler, no model path, 2x scale, no explicit target size,
// steps inherited from the base pass (0), denoising strength 0.7, tile size 128
// and no custom sigma schedule.
//
// The struct is value-initialized first so that any field not listed below
// starts zeroed; the explicit assignments then document the intended defaults.
// (The initialization sequence is performed exactly once.)
void sd_hires_params_init(sd_hires_params_t* hires_params) {
    *hires_params                     = {};
    hires_params->enabled             = false;
    hires_params->upscaler            = SD_HIRES_UPSCALER_LATENT;
    hires_params->model_path          = nullptr;
    hires_params->scale               = 2.0f;
    hires_params->target_width        = 0;
    hires_params->target_height       = 0;
    hires_params->steps               = 0;
    hires_params->denoising_strength  = 0.7f;
    hires_params->upscale_tile_size   = 128;
    hires_params->custom_sigmas       = nullptr;
    hires_params->custom_sigmas_count = 0;
}
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
@ -2986,6 +3008,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
sd_vid_gen_params->moe_boundary = 0.875f;
sd_vid_gen_params->vace_strength = 1.f;
sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
sd_vid_gen_params->hires.enabled = false;
sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT;
sd_vid_gen_params->hires.scale = 2.f;
sd_vid_gen_params->hires.target_width = 0;
sd_vid_gen_params->hires.target_height = 0;
sd_vid_gen_params->hires.steps = 0;
sd_vid_gen_params->hires.denoising_strength = 0.7f;
sd_vid_gen_params->hires.upscale_tile_size = 128;
sd_vid_gen_params->hires.custom_sigmas = nullptr;
sd_vid_gen_params->hires.custom_sigmas_count = 0;
sd_cache_params_init(&sd_vid_gen_params->cache);
}
@ -3235,6 +3267,7 @@ struct GenerationRequest {
vace_strength = sd_vid_gen_params->vace_strength;
guidance = sd_vid_gen_params->sample_params.guidance;
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
hires = sd_vid_gen_params->hires;
resolve(sd_ctx);
if (frames != requested_frames) {
LOG_WARN("align video frames from %d to %d for %s",
@ -3293,6 +3326,20 @@ struct GenerationRequest {
hires.enabled = false;
return;
}
if (hires.custom_sigmas_count < 0) {
LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas");
hires.custom_sigmas = nullptr;
hires.custom_sigmas_count = 0;
}
if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) {
LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas");
hires.custom_sigmas_count = 0;
}
if (hires.custom_sigmas_count == 1) {
LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas");
hires.custom_sigmas = nullptr;
hires.custom_sigmas_count = 0;
}
hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
hires.steps = std::max(0, hires.steps);
@ -3657,6 +3704,85 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso
return sd::ops::concat(video_mask_full, audio_mask, 3);
}
// Builds a denoise mask for a video latent: a tensor whose first three
// dimensions match those of `video_latent` and whose last two dimensions are 1,
// filled with `value`. An empty latent yields an empty mask.
static sd::Tensor<float> make_ltxav_video_denoise_mask(const sd::Tensor<float>& video_latent, float value = 1.f) {
    if (!video_latent.empty()) {
        const auto& dims = video_latent.shape();
        return sd::full<float>({dims[0], dims[1], dims[2], 1, 1}, value);
    }
    return {};
}
// Encodes a single conditioning image into latent space with the first-stage
// encoder. The image is reshaped to insert a singleton dimension at index 2
// before encoding. `name` only labels the error message.
// Returns an empty tensor when the context or image is missing, or when
// encoding fails (an error is logged in the latter case).
static sd::Tensor<float> encode_ltxav_condition_image(sd_ctx_t* sd_ctx,
                                                      const sd::Tensor<float>& image,
                                                      const char* name) {
    const bool has_context = sd_ctx != nullptr && sd_ctx->sd != nullptr;
    if (!has_context || image.empty()) {
        return {};
    }

    auto single_frame = image.reshape({image.shape()[0],
                                       image.shape()[1],
                                       1,
                                       image.shape()[2],
                                       image.shape()[3]});
    auto encoded      = sd_ctx->sd->encode_first_stage(single_frame);
    if (encoded.empty()) {
        LOG_ERROR("failed to encode LTXAV %s image", name);
    }
    return encoded;
}
// Writes `condition_latent` into `video_latent` along dimension 2, starting at
// `latent_idx`, and fills the same range of `video_mask` with
// `conditioned_mask`.
//
// Fails (returns false) when either output tensor is missing/empty, when the
// condition latent is empty or disagrees with the video latent on dimensions
// 0, 1 or 3, or when the target range does not fit inside the video latent.
// `name` only labels the error messages.
static bool apply_ltxav_condition_by_latent_index(sd::Tensor<float>* video_latent,
                                                  sd::Tensor<float>* video_mask,
                                                  const sd::Tensor<float>& condition_latent,
                                                  int64_t latent_idx,
                                                  const char* name,
                                                  float conditioned_mask) {
    if (video_latent == nullptr || video_mask == nullptr) {
        return false;
    }
    if (video_latent->empty() || video_mask->empty()) {
        return false;
    }

    // Short-circuit keeps shape() from being queried on an empty condition latent.
    const bool shape_matches = !condition_latent.empty() &&
                               condition_latent.shape()[0] == video_latent->shape()[0] &&
                               condition_latent.shape()[1] == video_latent->shape()[1] &&
                               condition_latent.shape()[3] == video_latent->shape()[3];
    if (!shape_matches) {
        LOG_ERROR("invalid LTXAV %s condition latent shape", name);
        return false;
    }

    const int64_t total_frames = video_latent->shape()[2];
    const int64_t cond_frames  = condition_latent.shape()[2];
    const bool range_fits      = latent_idx >= 0 &&
                                 cond_frames > 0 &&
                                 latent_idx + cond_frames <= total_frames;
    if (!range_fits) {
        LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
                  name,
                  latent_idx,
                  cond_frames,
                  total_frames);
        return false;
    }

    const int64_t range_end = latent_idx + cond_frames;
    sd::ops::slice_assign(video_latent, 2, latent_idx, range_end, condition_latent);
    sd::ops::fill_slice(video_mask, 2, latent_idx, range_end, conditioned_mask);
    return true;
}
// Convenience wrapper: encodes `image` and applies the resulting latent at
// `latent_idx`. The mask value written for the conditioned range is
// 1 - clamp(strength, 0, 1). Returns false when encoding or applying fails.
static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx,
                                                        const sd::Tensor<float>& image,
                                                        sd::Tensor<float>* video_latent,
                                                        sd::Tensor<float>* video_mask,
                                                        int64_t latent_idx,
                                                        const char* name,
                                                        float strength) {
    auto encoded = encode_ltxav_condition_image(sd_ctx, image, name);
    if (encoded.empty()) {
        return false;
    }

    const float mask_value = 1.0f - std::clamp(strength, 0.f, 1.f);
    return apply_ltxav_condition_by_latent_index(video_latent,
                                                 video_mask,
                                                 encoded,
                                                 latent_idx,
                                                 name,
                                                 mask_value);
}
static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent,
int audio_length,
int video_channels) {
@ -4218,6 +4344,53 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
return {};
}
// Builds the sigma schedule for the hires / refine sampling pass.
//
// - If `hires` carries a custom sigma array (count > 0 and non-null pointer),
//   that array is returned verbatim and *scheduler_steps_out is set to
//   count - 1.
// - Otherwise the schedule is derived from the scheduler: the step count is
//   `hires.steps` when positive, else `default_steps` (at least 1), scaled up
//   by 1 / denoising_strength; the resulting schedule is then trimmed so only
//   its tail (t_enc + 1 sigmas) is kept. *scheduler_steps_out receives the
//   scaled-up scheduler step count.
//
// `scheduler_steps_out` may be null. It is reset to 0 on entry.
static std::vector<float> make_hires_sigma_schedule(sd_ctx_t* sd_ctx,
                                                    const sd_hires_params_t& hires,
                                                    const sd_sample_params_t& sample_params,
                                                    sample_method_t sample_method,
                                                    int default_steps,
                                                    int sample_seq_len,
                                                    int* scheduler_steps_out) {
    if (scheduler_steps_out != nullptr) {
        *scheduler_steps_out = 0;
    }

    if (hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr) {
        std::vector<float> custom_sigmas(hires.custom_sigmas,
                                         hires.custom_sigmas + hires.custom_sigmas_count);
        if (scheduler_steps_out != nullptr) {
            *scheduler_steps_out = static_cast<int>(custom_sigmas.size()) - 1;
        }
        return custom_sigmas;
    }

    // Same bounds the request validation applies to hires.denoising_strength.
    // Re-applied here so this helper never divides by zero (or a negative
    // value) if it is handed parameters that skipped that validation.
    const float denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);

    int effective_steps = hires.steps > 0 ? hires.steps : default_steps;
    effective_steps     = std::max(1, effective_steps);

    // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
    // unlike img2img which trims from a fixed step count.
    int scheduler_steps = static_cast<int>(effective_steps / denoising_strength);
    scheduler_steps     = std::max(1, scheduler_steps);

    scheduler_t scheduler = resolve_scheduler(sd_ctx,
                                              sample_params.scheduler,
                                              sample_method);

    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(scheduler_steps,
                                                                 sample_seq_len,
                                                                 scheduler,
                                                                 sd_ctx->sd->version,
                                                                 sample_params.extra_sample_args);

    size_t t_enc = static_cast<size_t>(scheduler_steps * denoising_strength);
    if (t_enc >= static_cast<size_t>(scheduler_steps)) {
        t_enc = static_cast<size_t>(scheduler_steps) - 1;
    }

    if (scheduler_steps_out != nullptr) {
        *scheduler_steps_out = scheduler_steps;
    }

    // Index of the first sigma to keep. Bounded by the number of sigmas that
    // were actually produced so a shorter-than-expected schedule results in an
    // empty return value instead of an out-of-range iterator.
    const size_t wanted_start = static_cast<size_t>(scheduler_steps) - t_enc - 1;
    const size_t start        = std::min(wanted_start, sigmas.size());
    return std::vector<float>(sigmas.begin() + static_cast<std::vector<float>::difference_type>(start),
                              sigmas.end());
}
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
return nullptr;
@ -4340,29 +4513,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
}
}
int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps;
// sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
// unlike img2img which trims from a fixed step count
hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength);
std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas(
hires_steps,
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
sd_img_gen_params->sample_params.scheduler,
sd_ctx->sd->version,
sd_img_gen_params->sample_params.extra_sample_args);
size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
if (t_enc >= static_cast<size_t>(hires_steps)) {
t_enc = static_cast<size_t>(hires_steps) - 1;
}
std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
hires_sigmas.end());
LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
hires_steps,
int hires_scheduler_steps = 0;
std::vector<float> hires_sigma_sched =
make_hires_sigma_schedule(sd_ctx,
request.hires,
sd_img_gen_params->sample_params,
plan.sample_method,
plan.sample_steps,
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
&hires_scheduler_steps);
LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s",
hires_scheduler_steps,
request.hires.denoising_strength,
hires_sigma_sched.size());
hires_sigma_sched.size(),
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
std::vector<sd::Tensor<float>> hires_final_latents;
int64_t hires_denoise_start = ggml_time_ms();
@ -4510,44 +4674,7 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
float conditioned_mask = 1.0f - conditioning_strength;
latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0],
latents.init_latent.shape()[1],
latents.init_latent.shape()[2],
1,
1},
1.f);
auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
auto condition_image = image.reshape({image.shape()[0],
image.shape()[1],
1,
image.shape()[2],
image.shape()[3]});
auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
if (condition_latent.empty()) {
LOG_ERROR("failed to encode LTXAV %s image", name);
}
return condition_latent;
};
auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
int64_t latent_idx,
const char* name) -> bool {
int64_t latent_frames = latents.init_latent.shape()[2];
int64_t condition_frames = condition_latent.shape()[2];
if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
name,
latent_idx,
condition_frames,
latent_frames);
return false;
}
sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
return true;
};
latents.denoise_mask = make_ltxav_video_denoise_mask(latents.init_latent, 1.f);
auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
int frame_idx,
@ -4585,20 +4712,30 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
};
if (!start_image.empty()) {
auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
start_image,
&latents.init_latent,
&latents.denoise_mask,
0,
"init",
conditioning_strength)) {
return std::nullopt;
}
}
if (!end_image.empty()) {
auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
if (end_image_latent.empty()) {
return std::nullopt;
}
int frame_idx = request->frames - 1;
bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
bool ok = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent,
&latents.denoise_mask,
end_image_latent,
0,
"end",
conditioned_mask)
: apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
if (!ok) {
return std::nullopt;
@ -4879,6 +5016,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
return result_images;
}
// Runs the LTX latent upsampler model over a sampled latent.
//
// `packed_latent` is either a plain video latent or a video latent with audio
// packed behind it along dimension 3 (detected by dimension 3 being larger
// than the model's latent channel count). In the packed case the audio part is
// split off first and re-attached unchanged to the upscaled video latent.
//
// The video latent is un-normalized before the upsampler runs and normalized
// again afterwards. The upsampler is loaded from `model_path` for this call
// only and released as soon as it has produced its output.
//
// Returns an empty tensor on any failure (each failure path logs an error,
// except for missing context / empty input and a failed backend setup).
static sd::Tensor<float> upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx,
                                                          const char* model_path,
                                                          const sd::Tensor<float>& packed_latent,
                                                          int audio_length) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) {
        return {};
    }
    if (strlen(SAFE_STR(model_path)) == 0) {
        LOG_ERROR("LTX latent spatial upscale requires a model path");
        return {};
    }
    if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) {
        return {};
    }

    int latent_channels            = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_latent = packed_latent;
    sd::Tensor<float> audio_latent;
    if (packed_latent.shape()[3] > latent_channels) {
        video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels);
        audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels);
        if (audio_latent.empty()) {
            // The latent carries audio channels, so an empty unpack result is a
            // failure. Stop here rather than silently returning a video-only
            // latent (the refine conditioning step treats this case the same way).
            LOG_ERROR("failed to unpack LTXAV audio latent before LTX latent spatial upscale");
            return {};
        }
    }

    LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2",
             (int)video_latent.shape()[0],
             (int)video_latent.shape()[1],
             (int)video_latent.shape()[2],
             (int)video_latent.shape()[3]);

    sd::Tensor<float> unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent);
    if (unnormalized.empty()) {
        LOG_ERROR("LTX latent un-normalization failed before spatial upscale");
        return {};
    }

    std::unique_ptr<LTXVUpsampler::LatentUpsamplerRunner> upsampler =
        std::make_unique<LTXVUpsampler::LatentUpsamplerRunner>(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER),
                                                               sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER));
    const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
    upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes);
    if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) {
        LOG_ERROR("load LTX latent upsampler failed");
        return {};
    }

    sd::Tensor<float> upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized);
    // Release the upsampler before the remaining work; it is not needed again.
    upsampler.reset();
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent spatial upscale failed");
        return {};
    }

    upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled);
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent normalization failed after spatial upscale");
        return {};
    }

    if (!audio_latent.empty()) {
        upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent);
    }
    return upscaled;
}
// Re-applies init/end image conditioning to a latent before the refine
// sampling pass.
//
// On success `*latent` holds the conditioned latent and `*denoise_mask` the
// matching mask (both re-packed with audio when the input latent carried
// audio). `*video_positions` is only written when an end image is applied at a
// frame index other than 0.
//
// Returns true without touching the outputs when any required pointer is
// missing, the latent is empty, or neither an init nor an end image was
// supplied. Returns false when conditioning was requested but could not be
// applied.
static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx,
                                                 const sd_vid_gen_params_t* sd_vid_gen_params,
                                                 const GenerationRequest& request,
                                                 const ImageGenerationLatents& latents,
                                                 sd::Tensor<float>* latent,
                                                 sd::Tensor<float>* denoise_mask,
                                                 sd::Tensor<float>* video_positions) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || sd_vid_gen_params == nullptr) {
        return true;
    }
    if (latent == nullptr || latent->empty() || denoise_mask == nullptr || video_positions == nullptr) {
        return true;
    }

    const bool has_init_image = sd_vid_gen_params->init_image.data != nullptr;
    const bool has_end_image  = sd_vid_gen_params->end_image.data != nullptr;
    if (!has_init_image && !has_end_image) {
        return true;
    }

    if (sd_ctx->sd->vae_decode_only) {
        LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
        return false;
    }

    // Conditioned frames are pinned at full strength in the refine pass.
    constexpr float conditioning_strength = 1.f;
    constexpr float pinned_mask_value     = 1.f - conditioning_strength;

    // Separate the video part from packed audio, if any.
    int latent_channels           = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_part  = *latent;
    sd::Tensor<float> audio_part;
    if (latent->shape()[3] > latent_channels) {
        video_part = sd::ops::slice(*latent, 3, 0, latent_channels);
        audio_part = unpack_ltxav_audio_latent(*latent, latents.audio_length, latent_channels);
        if (audio_part.empty()) {
            LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning");
            return false;
        }
    }

    int image_width  = static_cast<int>(video_part.shape()[0]) * request.vae_scale_factor;
    int image_height = static_cast<int>(video_part.shape()[1]) * request.vae_scale_factor;

    sd::Tensor<float> video_mask = make_ltxav_video_denoise_mask(video_part, 1.f);

    if (has_init_image) {
        sd::Tensor<float> start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, image_width, image_height);
        const bool applied            = apply_ltxav_condition_image_by_latent_index(sd_ctx,
                                                                                    start_image,
                                                                                    &video_part,
                                                                                    &video_mask,
                                                                                    0,
                                                                                    "init",
                                                                                    conditioning_strength);
        if (!applied) {
            return false;
        }
    }

    if (has_end_image) {
        sd::Tensor<float> end_image        = sd_image_to_tensor(sd_vid_gen_params->end_image, image_width, image_height);
        sd::Tensor<float> end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
        if (end_image_latent.empty()) {
            return false;
        }

        int frame_idx              = request.frames - 1;
        const bool at_first_frame  = frame_idx == 0;
        int64_t insert_at          = 0;
        if (!at_first_frame) {
            // A non-zero frame index relies on the keyframe metadata recorded
            // with the latents to locate where the end latent belongs.
            if (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0) {
                LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata");
                return false;
            }
            insert_at = latents.video_target_frame_count;
        }

        if (!apply_ltxav_condition_by_latent_index(&video_part,
                                                   &video_mask,
                                                   end_image_latent,
                                                   insert_at,
                                                   "end",
                                                   pinned_mask_value)) {
            return false;
        }

        if (!at_first_frame) {
            *video_positions = build_ltxv_video_positions(video_part.shape()[0],
                                                          video_part.shape()[1],
                                                          insert_at,
                                                          end_image_latent.shape()[2],
                                                          frame_idx,
                                                          1,
                                                          request.fps,
                                                          request.vae_scale_factor,
                                                          8,
                                                          true);
        }
    }

    if (audio_part.empty()) {
        *latent       = std::move(video_part);
        *denoise_mask = std::move(video_mask);
    } else {
        *latent       = pack_ltxav_audio_and_video_latents(video_part, audio_part);
        *denoise_mask = pack_ltxav_audio_and_video_denoise_mask(video_mask, video_part, audio_part);
    }

    LOG_INFO("LTXV refine image conditioning applied at %dx%d", image_width, image_height);
    return true;
}
SD_API bool generate_video(sd_ctx_t* sd_ctx,
const sd_vid_gen_params_t* sd_vid_gen_params,
sd_image_t** frames_out,
@ -4899,6 +5205,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
int64_t t0 = ggml_time_ms();
sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
GenerationRequest request(sd_ctx, sd_vid_gen_params);
bool latent_upscale_enabled = request.hires.enabled;
GenerationRequest hires_request = request;
if (latent_upscale_enabled) {
if (!sd_version_is_ltxav(sd_ctx->sd->version)) {
LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models");
return false;
}
if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) {
LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL");
return false;
}
if (strlen(SAFE_STR(request.hires.model_path)) == 0) {
LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided");
return false;
}
}
sd_ctx->sd->rng->manual_seed(request.seed);
sd_ctx->sd->sampler_rng->manual_seed(request.seed);
sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
@ -4910,14 +5233,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
return false;
}
ImageGenerationLatents latents = std::move(*latent_inputs_opt);
ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
sd_vid_gen_params,
request,
latents);
LOG_INFO("generate_video %dx%dx%d",
request.width,
request.height,
request.frames);
ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
sd_vid_gen_params,
request,
latents);
if (latent_upscale_enabled) {
LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale",
request.width,
request.height,
request.frames);
} else {
LOG_INFO("generate_video %dx%dx%d",
request.width,
request.height,
request.frames);
}
int64_t latent_start = ggml_time_ms();
int W = request.width / request.vae_scale_factor;
@ -5009,15 +5340,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
latents.video_positions);
int64_t sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
if (final_latent.empty()) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
return false;
}
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
if (latent_upscale_enabled) {
int64_t upscale_start = ggml_time_ms();
sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
request.hires.model_path,
final_latent,
latents.audio_length);
int64_t upscale_end = ggml_time_ms();
if (upscaled_latent.empty()) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return false;
}
LOG_INFO("LTX latent spatial upscale completed, taking %.2fs",
(upscale_end - upscale_start) * 1.0f / 1000);
x_t = std::move(upscaled_latent);
hires_request.width = static_cast<int>(x_t.shape()[0]) * hires_request.vae_scale_factor;
hires_request.height = static_cast<int>(x_t.shape()[1]) * hires_request.vae_scale_factor;
if ((request.hires.target_width > 0 || request.hires.target_height > 0) &&
(request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) {
LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d",
hires_request.width,
hires_request.height,
request.hires.target_width,
request.hires.target_height);
}
sd::Tensor<float> hires_denoise_mask;
sd::Tensor<float> hires_video_positions;
if (!apply_ltxv_refine_image_conditioning(sd_ctx,
sd_vid_gen_params,
hires_request,
latents,
&x_t,
&hires_denoise_mask,
&hires_video_positions)) {
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
return false;
}
noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
W = hires_request.width / hires_request.vae_scale_factor;
H = hires_request.height / hires_request.vae_scale_factor;
T = static_cast<int>(x_t.shape()[2]);
sample_method_t hires_sample_method = plan.sample_method;
int hires_scheduler_steps = 0;
std::vector<float> hires_sigma_sched =
make_hires_sigma_schedule(sd_ctx,
request.hires,
sd_vid_gen_params->sample_params,
hires_sample_method,
plan.sample_steps,
sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T,
&hires_scheduler_steps);
float hires_eta = resolve_eta(sd_ctx,
sd_vid_gen_params->sample_params.eta,
hires_sample_method);
LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T);
LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s",
hires_scheduler_steps,
request.hires.denoising_strength,
sampling_methods_str[hires_sample_method],
hires_sigma_sched.size(),
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
sampling_start = ggml_time_ms();
final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
true,
x_t,
std::move(noise),
embeds.cond,
hires_request.use_uncond ? embeds.uncond : SDCondition(),
embeds.img_cond,
embeds.id_cond,
sd::Tensor<float>(),
0.f,
sd_vid_gen_params->sample_params.guidance,
hires_eta,
sd_vid_gen_params->sample_params.shifted_timestep,
hires_sample_method,
sd_ctx->sd->is_flow_denoiser(),
plan.extra_sample_args,
hires_sigma_sched,
-1,
std::vector<sd::Tensor<float>>{},
false,
hires_denoise_mask,
sd::Tensor<float>(),
hires_request.vace_strength,
latents.audio_length,
static_cast<float>(hires_request.fps),
hires_request.cache_params,
hires_video_positions);
sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
if (final_latent.empty()) {
LOG_ERROR("sampling(latent upscale) failed after %.2fs",
(sampling_end - sampling_start) * 1.0f / 1000);
return false;
}
LOG_INFO("sampling(latent upscale) completed, taking %.2fs",
(sampling_end - sampling_start) * 1.0f / 1000);
} else if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->diffusion_model->free_params_buffer();
}
sd_audio_t* generated_audio = nullptr;
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
latents.audio_length > 0 &&
@ -5048,7 +5490,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
int64_t latent_end = ggml_time_ms();
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
if (result == nullptr) {
free_sd_audio(generated_audio);
return false;

View file

@ -332,6 +332,8 @@ typedef struct {
int steps;
float denoising_strength;
int upscale_tile_size;
float* custom_sigmas;
int custom_sigmas_count;
} sd_hires_params_t;
typedef struct {
@ -382,6 +384,7 @@ typedef struct {
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
sd_hires_params_t hires;
} sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t;

View file

@ -322,13 +322,21 @@ class TinyVideoEncoder : public UnaryBlock {
int patch_size = 1;
public:
TinyVideoEncoder(int z_channels = 4, int patch_size = 1)
int t_downscale = 1;
TinyVideoEncoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_downscale = {true, true, false})
: z_channels(z_channels), patch_size(patch_size) {
// self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool))
t_downscale = 1;
for (bool downscale : time_downscale) {
if (downscale) {
t_downscale *= 2;
}
}
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_layers; i++) {
int stride = i == num_layers - 1 ? 1 : 2;
int stride = time_downscale[i] ? 2 : 1;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TPool(hidden, stride));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int j = 0; j < num_blocks; j++) {
@ -375,15 +383,22 @@ class TinyVideoDecoder : public UnaryBlock {
static const int num_layers = 3;
int channels[num_layers + 1] = {256, 128, 64, 64};
int patch_size = 1;
int t_upscale = 1;
public:
TinyVideoDecoder(int z_channels = 4, int patch_size = 1)
TinyVideoDecoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_upscale = {false, true, true})
: z_channels(z_channels), patch_size(patch_size) {
t_upscale = 1;
for (bool upscale : time_upscale) {
if (upscale) {
t_upscale *= 2;
}
}
int index = 1; // Clamp()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_layers; i++) {
int stride = i == 0 ? 1 : 2;
int stride = time_upscale[i] ? 2 : 1;
for (int j = 0; j < num_blocks; j++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(channels[i], channels[i]));
}
@ -430,8 +445,8 @@ public:
if (patch_size > 1) {
h = unpatchify(ctx->ggml_ctx, h, patch_size, 1);
}
// shape(W, H, 3, 3 + T) => shape(W, H, 3, T)
h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]);
// shape(W, H, 3, (t_upscale - 1) + T) => shape(W, H, 3, T)
h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - (t_upscale - 1), h->nb[1], h->nb[2], h->nb[3], (t_upscale - 1) * h->nb[3]);
return h;
}
};
@ -442,7 +457,9 @@ protected:
SDVersion version;
public:
int z_channels = 16;
int z_channels = 16;
std::vector<bool> time_downscale = {true, true, false};
std::vector<bool> time_upscale = {false, true, true};
public:
TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2)
@ -451,21 +468,26 @@ public:
if (version == VERSION_WAN2_2_TI2V) {
z_channels = 48;
patch = 2;
} else if (sd_version_is_ltxav(version)) {
z_channels = 128;
patch = 4;
time_downscale = {true, true, true};
time_upscale = {true, true, true};
}
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch));
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch, time_upscale));
if (!decode_only) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch, time_downscale));
}
}
ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyVideoDecoder>(blocks["decoder"]);
if (sd_version_is_wan(version)) {
if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
// (W, H, C, T) -> (W, H, T, C)
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 1, 3, 2));
}
auto result = decoder->forward(ctx, z);
if (sd_version_is_wan(version)) {
if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
// (W, H, C, T) -> (W, H, T, C)
result = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, result, 0, 1, 3, 2));
}
@ -477,10 +499,10 @@ public:
// (W, H, T, C) -> (W, H, C, T)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));
int64_t num_frames = x->ne[3];
if (num_frames % 4) {
// pad to multiple of 4 at the end
if (num_frames % encoder->t_downscale) {
// pad to multiple of encoder->t_downscale at the end
auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]);
for (int i = 0; i < 4 - num_frames % 4; i++) {
for (int i = 0; i < encoder->t_downscale - num_frames % encoder->t_downscale; i++) {
x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3);
}
}