mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 19:47:49 +00:00
sd: sync to master-637-ef92a00
This commit is contained in:
parent
627e317cd7
commit
f27795cef0
9 changed files with 1070 additions and 142 deletions
2
Makefile
2
Makefile
|
|
@ -695,7 +695,7 @@ llama-impl.o: src/llama-impl.cpp src/llama-impl.h
|
|||
budget.o: common/reasoning-budget.cpp common/reasoning-budget.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp
|
||||
SDCPP_COMMON_BASENAMES := anima.hpp auto_encoder_kl.hpp avi_writer.h cache_dit.hpp clip.hpp common_block.hpp common_dit.hpp condition_cache_utils.hpp conditioner.hpp control.hpp convert.cpp denoiser.hpp diffusion_model.hpp easycache.hpp ernie_image.hpp esrgan.hpp flux.hpp ggml_extend.hpp ggml_extend_backend.cpp ggml_extend_backend.h ggml_graph_cut.cpp ggml_graph_cut.h gits_noise.inl guidance.cpp guidance.h hidream_o1.hpp image_metadata.cpp image_metadata.h kcpp_sd_extensions.h latent-preview.h llm.hpp lora.hpp ltx_audio_vae.h ltx_latent_upscaler.hpp ltx_vae.hpp ltxv.hpp mmdit.hpp model.cpp model.h model_io/binary_io.h model_io/gguf_io.cpp model_io/gguf_io.h model_io/gguf_reader_ext.h model_io/pickle_io.cpp model_io/pickle_io.h model_io/safetensors_io.cpp model_io/safetensors_io.h model_io/tensor_storage.h model_io/torch_legacy_io.cpp model_io/torch_legacy_io.h model_io/torch_zip_io.cpp model_io/torch_zip_io.h msf_gif.h name_conversion.cpp name_conversion.h ordered_map.hpp pmid.hpp preprocessing.hpp qwen_image.hpp rng.hpp rng_mt19937.hpp rng_philox.hpp rope.hpp sample-cache.cpp sample-cache.h spectrum.hpp stable-diffusion.cpp stable-diffusion.h t5.hpp tae.hpp tensor.hpp tensor_ggml.hpp thirdparty/LICENSE.darts_clone.txt thirdparty/darts.h thirdparty/miniz.h thirdparty/stb_image_resize.h thirdparty/stb_image_write.h thirdparty/zip.c thirdparty/zip.h tokenizers/bpe_tokenizer.cpp tokenizers/bpe_tokenizer.h tokenizers/clip_tokenizer.cpp tokenizers/clip_tokenizer.h tokenizers/gemma_tokenizer.cpp tokenizers/gemma_tokenizer.h tokenizers/mistral_tokenizer.cpp tokenizers/mistral_tokenizer.h tokenizers/qwen2_tokenizer.cpp tokenizers/qwen2_tokenizer.h tokenizers/t5_unigram_tokenizer.cpp tokenizers/t5_unigram_tokenizer.h tokenizers/tokenize_util.cpp tokenizers/tokenize_util.h tokenizers/tokenizer.cpp tokenizers/tokenizer.h tokenizers/vocab/vocab.h ucache.hpp unet.hpp upscaler.cpp upscaler.h util.cpp util.h vae.hpp wan.hpp z_image.hpp
|
||||
|
||||
SDCPP_MAIN_BASENAMES := common/common.cpp common/common.h common/log.cpp common/log.h common/media_io.cpp common/media_io.cpp common/media_io.h common/resource_owners.hpp convert.cpp image_metadata.cpp main.cpp tokenizers/vocab/clip_merges.hpp tokenizers/vocab/gemma_merges.hpp tokenizers/vocab/gemma_vocab.hpp tokenizers/vocab/mistral_merges.hpp tokenizers/vocab/mistral_vocab.hpp tokenizers/vocab/qwen_merges.hpp tokenizers/vocab/t5.hpp tokenizers/vocab/umt5.hpp tokenizers/vocab/vocab.cpp version.cpp
|
||||
|
||||
|
|
|
|||
|
|
@ -1134,11 +1134,11 @@ ArgOptions SDGenerationParams::get_options() {
|
|||
return 1;
|
||||
};
|
||||
|
||||
auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
auto parse_sigmas_arg = [&](const char* value, std::vector<float>* target, const char* option_name) {
|
||||
if (target == nullptr || value == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
std::string sigmas_str = argv[index];
|
||||
std::string sigmas_str = value;
|
||||
if (!sigmas_str.empty() && sigmas_str.front() == '[') {
|
||||
sigmas_str.erase(0, 1);
|
||||
}
|
||||
|
|
@ -1146,6 +1146,7 @@ ArgOptions SDGenerationParams::get_options() {
|
|||
sigmas_str.pop_back();
|
||||
}
|
||||
|
||||
size_t before = target->size();
|
||||
std::stringstream ss(sigmas_str);
|
||||
std::string item;
|
||||
while (std::getline(ss, item, ',')) {
|
||||
|
|
@ -1153,24 +1154,38 @@ ArgOptions SDGenerationParams::get_options() {
|
|||
item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);
|
||||
if (!item.empty()) {
|
||||
try {
|
||||
custom_sigmas.push_back(std::stof(item));
|
||||
target->push_back(std::stof(item));
|
||||
} catch (const std::invalid_argument&) {
|
||||
LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
|
||||
LOG_ERROR("error: invalid float value '%s' in %s", item.c_str(), option_name);
|
||||
return -1;
|
||||
} catch (const std::out_of_range&) {
|
||||
LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
|
||||
LOG_ERROR("error: float value '%s' out of range in %s", item.c_str(), option_name);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (custom_sigmas.empty() && !sigmas_str.empty()) {
|
||||
LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]);
|
||||
if (target->size() == before && !sigmas_str.empty()) {
|
||||
LOG_ERROR("error: could not parse any sigma values from '%s'", value);
|
||||
return -1;
|
||||
}
|
||||
return 1;
|
||||
};
|
||||
|
||||
auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
return -1;
|
||||
}
|
||||
return parse_sigmas_arg(argv[index], &custom_sigmas, "--sigmas");
|
||||
};
|
||||
|
||||
auto on_hires_sigmas_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
return -1;
|
||||
}
|
||||
return parse_sigmas_arg(argv[index], &hires_custom_sigmas, "--hires-sigmas");
|
||||
};
|
||||
|
||||
auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
return -1;
|
||||
|
|
@ -1293,6 +1308,10 @@ ArgOptions SDGenerationParams::get_options() {
|
|||
"--sigmas",
|
||||
"custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
|
||||
on_sigmas_arg},
|
||||
{"",
|
||||
"--hires-sigmas",
|
||||
"custom sigma values for the highres fix second pass, comma-separated (e.g., \"0.85,0.725,0.421875,0.0\").",
|
||||
on_hires_sigmas_arg},
|
||||
{"",
|
||||
"--skip-layers",
|
||||
"layers to skip for SLG steps (default: [7,8,9])",
|
||||
|
|
@ -1525,11 +1544,31 @@ static bool resolve_model_file_from_dir(const std::string& model_name,
|
|||
LOG_ERROR("%s directory is empty", label);
|
||||
return false;
|
||||
}
|
||||
auto ends_with_valid_ext = [&]() {
|
||||
for (const auto& ext : valid_ext) {
|
||||
if (model_name.size() < ext.size()) {
|
||||
continue;
|
||||
}
|
||||
auto suffix = model_name.substr(model_name.size() - ext.size());
|
||||
std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](unsigned char c) {
|
||||
return static_cast<char>(std::tolower(c));
|
||||
});
|
||||
std::string lower_ext = ext;
|
||||
std::transform(lower_ext.begin(), lower_ext.end(), lower_ext.begin(), [](unsigned char c) {
|
||||
return static_cast<char>(std::tolower(c));
|
||||
});
|
||||
if (suffix == lower_ext) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if (model_name.empty() ||
|
||||
model_name.find('/') != std::string::npos ||
|
||||
model_name.find('\\') != std::string::npos ||
|
||||
fs::path(model_name).has_root_path() ||
|
||||
fs::path(model_name).has_extension()) {
|
||||
ends_with_valid_ext()) {
|
||||
LOG_ERROR("%s must be a model name without path or extension: %s", label, model_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1633,6 +1672,9 @@ bool SDGenerationParams::from_json_str(
|
|||
if (hires_json.contains("denoising_strength") && hires_json["denoising_strength"].is_number()) {
|
||||
hires_denoising_strength = hires_json["denoising_strength"];
|
||||
}
|
||||
if (hires_json.contains("custom_sigmas") && hires_json["custom_sigmas"].is_array()) {
|
||||
hires_custom_sigmas = hires_json["custom_sigmas"].get<std::vector<float>>();
|
||||
}
|
||||
if (hires_json.contains("upscale_tile_size") && hires_json["upscale_tile_size"].is_number_integer()) {
|
||||
hires_upscale_tile_size = hires_json["upscale_tile_size"];
|
||||
}
|
||||
|
|
@ -2080,6 +2122,10 @@ bool SDGenerationParams::validate(SDMode mode) {
|
|||
LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
|
||||
return false;
|
||||
}
|
||||
if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
|
||||
LOG_ERROR("error: hires custom sigmas must contain at least two values");
|
||||
return false;
|
||||
}
|
||||
if (hires_upscale_tile_size < 1) {
|
||||
LOG_ERROR("error: hires upscale tile size must be positive");
|
||||
return false;
|
||||
|
|
@ -2174,15 +2220,17 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
|
|||
params.vae_tiling_params = vae_tiling_params;
|
||||
params.cache = cache_params;
|
||||
|
||||
params.hires.enabled = hires_enabled;
|
||||
params.hires.upscaler = resolved_hires_upscaler;
|
||||
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
|
||||
params.hires.scale = hires_scale;
|
||||
params.hires.target_width = hires_width;
|
||||
params.hires.target_height = hires_height;
|
||||
params.hires.steps = hires_steps;
|
||||
params.hires.denoising_strength = hires_denoising_strength;
|
||||
params.hires.upscale_tile_size = hires_upscale_tile_size;
|
||||
params.hires.enabled = hires_enabled;
|
||||
params.hires.upscaler = resolved_hires_upscaler;
|
||||
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
|
||||
params.hires.scale = hires_scale;
|
||||
params.hires.target_width = hires_width;
|
||||
params.hires.target_height = hires_height;
|
||||
params.hires.steps = hires_steps;
|
||||
params.hires.denoising_strength = hires_denoising_strength;
|
||||
params.hires.upscale_tile_size = hires_upscale_tile_size;
|
||||
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
|
||||
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
|
||||
return params;
|
||||
}
|
||||
|
||||
|
|
@ -2215,27 +2263,38 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
|
|||
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
|
||||
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
|
||||
|
||||
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
||||
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
||||
params.prompt = prompt.c_str();
|
||||
params.negative_prompt = negative_prompt.c_str();
|
||||
params.clip_skip = clip_skip;
|
||||
params.init_image = init_image.get();
|
||||
params.end_image = end_image.get();
|
||||
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
|
||||
params.control_frames_size = static_cast<int>(control_frame_views.size());
|
||||
params.width = get_resolved_width();
|
||||
params.height = get_resolved_height();
|
||||
params.sample_params = sample_params;
|
||||
params.high_noise_sample_params = high_noise_sample_params;
|
||||
params.moe_boundary = moe_boundary;
|
||||
params.strength = strength;
|
||||
params.seed = seed;
|
||||
params.video_frames = video_frames;
|
||||
params.fps = fps;
|
||||
params.vace_strength = vace_strength;
|
||||
params.vae_tiling_params = vae_tiling_params;
|
||||
params.cache = cache_params;
|
||||
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
||||
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
||||
params.prompt = prompt.c_str();
|
||||
params.negative_prompt = negative_prompt.c_str();
|
||||
params.clip_skip = clip_skip;
|
||||
params.init_image = init_image.get();
|
||||
params.end_image = end_image.get();
|
||||
params.control_frames = control_frame_views.empty() ? nullptr : control_frame_views.data();
|
||||
params.control_frames_size = static_cast<int>(control_frame_views.size());
|
||||
params.width = get_resolved_width();
|
||||
params.height = get_resolved_height();
|
||||
params.sample_params = sample_params;
|
||||
params.high_noise_sample_params = high_noise_sample_params;
|
||||
params.moe_boundary = moe_boundary;
|
||||
params.strength = strength;
|
||||
params.seed = seed;
|
||||
params.video_frames = video_frames;
|
||||
params.fps = fps;
|
||||
params.vace_strength = vace_strength;
|
||||
params.vae_tiling_params = vae_tiling_params;
|
||||
params.cache = cache_params;
|
||||
params.hires.enabled = hires_enabled;
|
||||
params.hires.upscaler = resolved_hires_upscaler;
|
||||
params.hires.model_path = hires_upscaler_model_path.empty() ? nullptr : hires_upscaler_model_path.c_str();
|
||||
params.hires.scale = hires_scale;
|
||||
params.hires.target_width = hires_width;
|
||||
params.hires.target_height = hires_height;
|
||||
params.hires.steps = hires_steps;
|
||||
params.hires.denoising_strength = hires_denoising_strength;
|
||||
params.hires.upscale_tile_size = hires_upscale_tile_size;
|
||||
params.hires.custom_sigmas = hires_custom_sigmas.empty() ? nullptr : hires_custom_sigmas.data();
|
||||
params.hires.custom_sigmas_count = static_cast<int>(hires_custom_sigmas.size());
|
||||
return params;
|
||||
}
|
||||
|
||||
|
|
@ -2318,6 +2377,7 @@ std::string SDGenerationParams::to_string() const {
|
|||
<< ", target_height: " << hires_height
|
||||
<< ", steps: " << hires_steps
|
||||
<< ", denoising_strength: " << hires_denoising_strength
|
||||
<< ", custom_sigmas: " << vec_to_string(hires_custom_sigmas)
|
||||
<< ", upscale_tile_size: " << hires_upscale_tile_size << " },\n"
|
||||
<< " vae_tiling_params: { "
|
||||
<< vae_tiling_params.enabled << ", "
|
||||
|
|
@ -2469,6 +2529,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
|
|||
{"target_height", gen_params.hires_height},
|
||||
{"steps", gen_params.hires_steps},
|
||||
{"denoising_strength", gen_params.hires_denoising_strength},
|
||||
{"custom_sigmas", gen_params.hires_custom_sigmas},
|
||||
{"upscale_tile_size", gen_params.hires_upscale_tile_size},
|
||||
};
|
||||
}
|
||||
|
|
@ -2588,6 +2649,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
|
|||
parameter_string += "Hires resize: " + std::to_string(gen_params.hires_width) + "x" + std::to_string(gen_params.hires_height) + ", ";
|
||||
parameter_string += "Hires steps: " + std::to_string(gen_params.hires_steps) + ", ";
|
||||
parameter_string += "Denoising strength: " + std::to_string(gen_params.hires_denoising_strength) + ", ";
|
||||
if (!gen_params.hires_custom_sigmas.empty()) {
|
||||
parameter_string += "Hires custom sigmas: " + vec_to_string(gen_params.hires_custom_sigmas) + ", ";
|
||||
}
|
||||
}
|
||||
parameter_string += "Version: stable-diffusion.cpp";
|
||||
parameter_string += ", SDCPP: " + build_sdcpp_image_metadata_json(ctx_params, gen_params, seed, mode);
|
||||
|
|
|
|||
|
|
@ -207,6 +207,7 @@ struct SDGenerationParams {
|
|||
int hires_steps = 0;
|
||||
float hires_denoising_strength = 0.7f;
|
||||
int hires_upscale_tile_size = 128;
|
||||
std::vector<float> hires_custom_sigmas;
|
||||
|
||||
std::map<std::string, float> lora_map;
|
||||
std::map<std::string, float> high_noise_lora_map;
|
||||
|
|
|
|||
348
otherarch/sdcpp/ltx_latent_upscaler.hpp
Normal file
348
otherarch/sdcpp/ltx_latent_upscaler.hpp
Normal file
|
|
@ -0,0 +1,348 @@
|
|||
#ifndef __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
#define __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "ggml_graph_cut.h"
|
||||
#include "model.h"
|
||||
#include "util.h"
|
||||
|
||||
namespace LTXVUpsampler {
|
||||
constexpr int LTX_UPSAMPLER_GRAPH_SIZE = 10240;
|
||||
|
||||
// Hyper-parameters describing one LTX latent upsampler network.
struct LatentUpsamplerConfig {
    // Channel count consumed by initial_conv and produced by final_conv.
    int64_t in_channels{128};
    // Channel width of the norm, residual and upsampling layers.
    int64_t mid_channels{1024};
    // Number of ResBlocks built before, and again after, the upsampling step.
    int num_blocks_per_stage{4};
    // Convolution dimensionality; LatentUpsampler asserts that this equals 3.
    int dims{3};
    // LatentUpsampler asserts spatial_upsample is set and the two flags below are clear.
    bool spatial_upsample{true};
    bool temporal_upsample{false};
    bool rational_resampler{false};
};
|
||||
|
||||
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& name) {
|
||||
return tensor_storage_map.find(name) != tensor_storage_map.end();
|
||||
}
|
||||
|
||||
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& name,
|
||||
int64_t fallback) {
|
||||
auto it = tensor_storage_map.find(name);
|
||||
if (it == tensor_storage_map.end()) {
|
||||
return fallback;
|
||||
}
|
||||
return it->second.ne[0];
|
||||
}
|
||||
|
||||
static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& module_name) {
|
||||
int max_block = -1;
|
||||
const std::string prefix = module_name + ".";
|
||||
for (const auto& pair : tensor_storage_map) {
|
||||
const std::string& name = pair.first;
|
||||
if (name.find(prefix) != 0) {
|
||||
continue;
|
||||
}
|
||||
size_t begin = prefix.size();
|
||||
size_t end = name.find('.', begin);
|
||||
if (end == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int index = atoi(name.substr(begin, end - begin).c_str());
|
||||
max_block = std::max(max_block, index);
|
||||
}
|
||||
return max_block + 1;
|
||||
}
|
||||
|
||||
// Derives a LatentUpsamplerConfig from the tensors present in a checkpoint.
// Channel widths come from ne[0] of "initial_norm.weight" and
// "final_conv.bias" (defaults are kept when those tensors are absent), the
// block count from the "res_blocks.<i>." names, and the upsampling flags from
// the presence of the respective "*.0.weight" tensors.
static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
    LatentUpsamplerConfig detected;

    detected.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", detected.mid_channels);
    detected.in_channels  = get_tensor_ne0(tensor_storage_map, "final_conv.bias", detected.in_channels);

    // Keep the default block count unless at least one numbered block was found.
    const int block_count = count_module_blocks(tensor_storage_map, "res_blocks");
    if (block_count >= 1) {
        detected.num_blocks_per_stage = block_count;
    }

    detected.spatial_upsample  = has_tensor(tensor_storage_map, "upsampler.0.weight");
    detected.temporal_upsample = has_tensor(tensor_storage_map, "temporal_upsampler.0.weight");
    return detected;
}
|
||||
|
||||
class VideoGroupNorm : public GGMLBlock {
|
||||
protected:
|
||||
int num_groups;
|
||||
int64_t num_channels;
|
||||
float eps;
|
||||
std::string prefix;
|
||||
|
||||
void init_params(ggml_context* ctx,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "") override {
|
||||
SD_UNUSED(tensor_storage_map);
|
||||
this->prefix = prefix;
|
||||
params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
|
||||
params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_channels);
|
||||
}
|
||||
|
||||
public:
|
||||
VideoGroupNorm(int num_groups, int64_t num_channels, float eps = 1e-05f)
|
||||
: num_groups(num_groups),
|
||||
num_channels(num_channels),
|
||||
eps(eps) {}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// LTX video latent layout is [W, H, T, C]. ggml_group_norm treats ne[2]
|
||||
// as channels, so fold only H/T internally and restore the same layout.
|
||||
GGML_ASSERT(x->ne[3] == num_channels);
|
||||
const int64_t W = x->ne[0];
|
||||
const int64_t H = x->ne[1];
|
||||
const int64_t T = x->ne[2];
|
||||
x = ggml_ext_cont(ctx->ggml_ctx, x);
|
||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H * T, num_channels, 1);
|
||||
x = ggml_group_norm(ctx->ggml_ctx, x, num_groups, eps);
|
||||
|
||||
ggml_tensor* weight = params["weight"];
|
||||
ggml_tensor* bias = params["bias"];
|
||||
if (ctx->weight_adapter) {
|
||||
weight = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, weight, prefix + "weight");
|
||||
bias = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, bias, prefix + "bias");
|
||||
}
|
||||
weight = ggml_reshape_4d(ctx->ggml_ctx, weight, 1, 1, num_channels, 1);
|
||||
bias = ggml_reshape_4d(ctx->ggml_ctx, bias, 1, 1, num_channels, 1);
|
||||
x = ggml_mul_inplace(ctx->ggml_ctx, x, weight);
|
||||
x = ggml_add_inplace(ctx->ggml_ctx, x, bias);
|
||||
return ggml_reshape_4d(ctx->ggml_ctx, x, W, H, T, num_channels);
|
||||
}
|
||||
};
|
||||
|
||||
class ResBlock : public GGMLBlock {
|
||||
public:
|
||||
ResBlock(int64_t channels, int dims = 3) {
|
||||
GGML_ASSERT(dims == 3);
|
||||
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
|
||||
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));
|
||||
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv3d(channels, channels, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}));
|
||||
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, channels));
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
auto conv1 = std::dynamic_pointer_cast<Conv3d>(blocks["conv1"]);
|
||||
auto norm1 = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm1"]);
|
||||
auto conv2 = std::dynamic_pointer_cast<Conv3d>(blocks["conv2"]);
|
||||
auto norm2 = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["norm2"]);
|
||||
|
||||
ggml_tensor* residual = x;
|
||||
|
||||
x = conv1->forward(ctx, x);
|
||||
x = norm1->forward(ctx, x);
|
||||
x = ggml_silu_inplace(ctx->ggml_ctx, x);
|
||||
x = conv2->forward(ctx, x);
|
||||
x = norm2->forward(ctx, x);
|
||||
x = ggml_add(ctx->ggml_ctx, x, residual);
|
||||
return ggml_silu(ctx->ggml_ctx, x);
|
||||
}
|
||||
};
|
||||
|
||||
class PixelShuffleND : public UnaryBlock {
|
||||
protected:
|
||||
int upscale_factor;
|
||||
|
||||
public:
|
||||
explicit PixelShuffleND(int upscale_factor)
|
||||
: upscale_factor(upscale_factor) {}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
|
||||
GGML_ASSERT(upscale_factor == 2);
|
||||
int64_t h = x->ne[1];
|
||||
int64_t w = x->ne[0];
|
||||
// x: [b*f, c*4, h, w] -> [b*f, c, h*2, w*2]
|
||||
x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 2, 0, 1, 3)); // [b*f, h, w, c*4]
|
||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0], x->ne[1] * x->ne[2], x->ne[3]); // [b*f, h*w, c*4]
|
||||
return DiT::unpatchify(ctx->ggml_ctx, x, h, w, upscale_factor, upscale_factor, true);
|
||||
}
|
||||
};
|
||||
|
||||
class LatentUpsampler : public GGMLBlock {
|
||||
public:
|
||||
LatentUpsamplerConfig config;
|
||||
|
||||
explicit LatentUpsampler(LatentUpsamplerConfig config)
|
||||
: config(std::move(config)) {
|
||||
GGML_ASSERT(this->config.dims == 3);
|
||||
GGML_ASSERT(this->config.spatial_upsample);
|
||||
GGML_ASSERT(!this->config.temporal_upsample);
|
||||
GGML_ASSERT(!this->config.rational_resampler);
|
||||
|
||||
blocks["initial_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.in_channels,
|
||||
this->config.mid_channels,
|
||||
{3, 3, 3},
|
||||
{1, 1, 1},
|
||||
{1, 1, 1}));
|
||||
blocks["initial_norm"] = std::shared_ptr<GGMLBlock>(new VideoGroupNorm(32, this->config.mid_channels));
|
||||
for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {
|
||||
blocks["res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
|
||||
}
|
||||
blocks["upsampler.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(this->config.mid_channels,
|
||||
4 * this->config.mid_channels,
|
||||
{3, 3},
|
||||
{1, 1},
|
||||
{1, 1}));
|
||||
blocks["upsampler.1"] = std::shared_ptr<GGMLBlock>(new PixelShuffleND(2));
|
||||
for (int i = 0; i < this->config.num_blocks_per_stage; ++i) {
|
||||
blocks["post_upsample_res_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new ResBlock(this->config.mid_channels, this->config.dims));
|
||||
}
|
||||
blocks["final_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(this->config.mid_channels,
|
||||
this->config.in_channels,
|
||||
{3, 3, 3},
|
||||
{1, 1, 1},
|
||||
{1, 1, 1}));
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||
// x: [b*c, f, h, w]
|
||||
// return: [b*c, f, h*2, w*2]
|
||||
auto initial_conv = std::dynamic_pointer_cast<Conv3d>(blocks["initial_conv"]);
|
||||
auto initial_norm = std::dynamic_pointer_cast<VideoGroupNorm>(blocks["initial_norm"]);
|
||||
auto upsample_conv = std::dynamic_pointer_cast<Conv2d>(blocks["upsampler.0"]);
|
||||
auto pixel_shuffle = std::dynamic_pointer_cast<PixelShuffleND>(blocks["upsampler.1"]);
|
||||
auto final_conv = std::dynamic_pointer_cast<Conv3d>(blocks["final_conv"]);
|
||||
|
||||
x = initial_conv->forward(ctx, x);
|
||||
x = initial_norm->forward(ctx, x);
|
||||
x = ggml_silu(ctx->ggml_ctx, x);
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.initial", "x");
|
||||
|
||||
for (int i = 0; i < config.num_blocks_per_stage; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<ResBlock>(blocks["res_blocks." + std::to_string(i)]);
|
||||
x = block->forward(ctx, x);
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.res_blocks." + std::to_string(i), "x");
|
||||
}
|
||||
|
||||
// rearrange(x, "b c f h w -> (b f) c h w"),
|
||||
x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // [b*f, c, h, w]
|
||||
x = upsample_conv->forward(ctx, x); // [b*f, c*4, h, w]
|
||||
x = pixel_shuffle->forward(ctx, x); // [b*f, c, h*2, w*2]
|
||||
x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // [b*c, f, h, w]
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.spatial_up", "x");
|
||||
|
||||
for (int i = 0; i < config.num_blocks_per_stage; ++i) {
|
||||
auto block = std::dynamic_pointer_cast<ResBlock>(blocks["post_upsample_res_blocks." + std::to_string(i)]);
|
||||
x = block->forward(ctx, x);
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.post_blocks." + std::to_string(i), "x");
|
||||
}
|
||||
|
||||
x = final_conv->forward(ctx, x);
|
||||
sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x");
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct LatentUpsamplerRunner : public GGMLRunner {
|
||||
std::unique_ptr<LatentUpsampler> model;
|
||||
|
||||
LatentUpsamplerRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend)
|
||||
: GGMLRunner(backend, params_backend) {}
|
||||
|
||||
std::string get_desc() override {
|
||||
return "ltx_latent_upsampler";
|
||||
}
|
||||
|
||||
bool load_from_file(const std::string& file_path, int n_threads) {
|
||||
LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
|
||||
ModelLoader model_loader;
|
||||
if (!model_loader.init_from_file(file_path)) {
|
||||
LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||
if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
|
||||
!has_tensor(tensor_storage_map, "upsampler.0.weight")) {
|
||||
LOG_ERROR("unsupported LTX latent upsampler weights: expected spatial upsampler tensors");
|
||||
return false;
|
||||
}
|
||||
|
||||
LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
|
||||
if (config.dims != 3 || !config.spatial_upsample || config.temporal_upsample ||
|
||||
config.rational_resampler) {
|
||||
LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d",
|
||||
config.dims,
|
||||
config.spatial_upsample,
|
||||
config.temporal_upsample,
|
||||
config.rational_resampler);
|
||||
return false;
|
||||
}
|
||||
|
||||
model = std::make_unique<LatentUpsampler>(config);
|
||||
model->init(params_ctx, tensor_storage_map, "");
|
||||
if (!alloc_params_buffer()) {
|
||||
LOG_ERROR("LTX latent upsampler params buffer allocation failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
std::map<std::string, ggml_tensor*> tensors;
|
||||
model->get_param_tensors(tensors);
|
||||
if (!model_loader.load_tensors(tensors, {}, n_threads)) {
|
||||
LOG_ERROR("load LTX latent upsampler tensors failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d",
|
||||
config.in_channels,
|
||||
config.mid_channels,
|
||||
config.num_blocks_per_stage);
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
|
||||
if (!model) {
|
||||
return nullptr;
|
||||
}
|
||||
ggml_cgraph* gf = new_graph_custom(LTX_UPSAMPLER_GRAPH_SIZE);
|
||||
ggml_tensor* x = make_input(x_tensor);
|
||||
auto runner_ctx = get_context();
|
||||
ggml_tensor* out = model->forward(&runner_ctx, x);
|
||||
ggml_build_forward_expand(gf, out);
|
||||
return gf;
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(const int n_threads,
|
||||
const sd::Tensor<float>& x) {
|
||||
if (!model) {
|
||||
LOG_ERROR("LTX latent upsampler is not loaded");
|
||||
return {};
|
||||
}
|
||||
if (x.dim() != 4 && x.dim() != 5) {
|
||||
LOG_ERROR("LTX latent upsampler expects 4D or 5D video latent, got dim=%lld",
|
||||
(long long)x.dim());
|
||||
return {};
|
||||
}
|
||||
if (x.dim() == 5 && x.shape()[4] != 1) {
|
||||
LOG_ERROR("LTX latent upsampler currently supports batch size 1, got batch=%lld",
|
||||
(long long)x.shape()[4]);
|
||||
return {};
|
||||
}
|
||||
if (x.shape()[3] != model->config.in_channels) {
|
||||
LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
|
||||
model->config.in_channels,
|
||||
(long long)x.shape()[3]);
|
||||
return {};
|
||||
}
|
||||
size_t expected_dim = static_cast<size_t>(x.dim());
|
||||
auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), expected_dim);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace LTXVUpsampler
|
||||
|
||||
#endif // __SD_LTX_LATENT_UPSCALER_HPP__
|
||||
|
|
@ -1123,6 +1123,18 @@ namespace LTXVAE {
|
|||
mean = ggml_cont(ctx->ggml_ctx, mean);
|
||||
return processor->normalize(ctx, mean);
|
||||
}
|
||||
|
||||
// Applies the "per_channel_statistics" block's normalize() to `x`.
ggml_tensor* normalize_latents(GGMLRunnerContext* ctx,
                               ggml_tensor* x) {
    auto channel_stats = std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    ggml_tensor* normalized = channel_stats->normalize(ctx, x);
    return normalized;
}
|
||||
|
||||
// Adds the "per_channel_statistics" block's un_normalize() op for `x` to the
// graph owned by `ctx` and returns the resulting tensor.
ggml_tensor* un_normalize_latents(GGMLRunnerContext* ctx,
                                  ggml_tensor* x) {
    const auto channel_stats =
        std::dynamic_pointer_cast<PerChannelStatistics>(blocks["per_channel_statistics"]);
    return channel_stats->un_normalize(ctx, x);
}
|
||||
};
|
||||
|
||||
} // namespace LTXVAE
|
||||
|
|
@ -1192,6 +1204,17 @@ struct LTXVideoVAE : public VAE {
|
|||
return gf;
|
||||
}
|
||||
|
||||
// Builds a small graph that applies the VAE's latent statistics to `z_tensor`:
// vae.normalize_latents() when `normalize` is true, vae.un_normalize_latents()
// otherwise.
ggml_cgraph* build_latent_statistics_graph(const sd::Tensor<float>& z_tensor, bool normalize) {
    ggml_cgraph* graph  = new_graph_custom(1024);
    ggml_tensor* latent = make_input(z_tensor);

    auto graph_ctx      = get_context();
    ggml_tensor* result = nullptr;
    if (normalize) {
        result = vae.normalize_latents(&graph_ctx, latent);
    } else {
        result = vae.un_normalize_latents(&graph_ctx, latent);
    }

    ggml_build_forward_expand(graph, result);
    return graph;
}
|
||||
|
||||
sd::Tensor<float> _compute(const int n_threads,
|
||||
const sd::Tensor<float>& z,
|
||||
bool decode_graph) override {
|
||||
|
|
@ -1226,6 +1249,26 @@ struct LTXVideoVAE : public VAE {
|
|||
return result;
|
||||
}
|
||||
|
||||
// Executes the latent-statistics graph for `z` (normalize or un-normalize,
// selected by `normalize`) on `n_threads` threads. The result is passed through
// restore_trailing_singleton_dims() with the dimension count of `z`.
sd::Tensor<float> apply_latent_statistics(const int n_threads,
                                          const sd::Tensor<float>& z,
                                          bool normalize) {
    const size_t input_rank = static_cast<size_t>(z.dim());
    auto graph_builder      = [&]() -> ggml_cgraph* {
        return build_latent_statistics_graph(z, normalize);
    };
    return restore_trailing_singleton_dims(
        GGMLRunner::compute<float>(graph_builder, n_threads, false),
        input_rank);
}
|
||||
|
||||
// Convenience wrapper: apply_latent_statistics() in normalize mode.
sd::Tensor<float> normalize_latents(const int n_threads,
                                    const sd::Tensor<float>& z) {
    constexpr bool normalize = true;
    return apply_latent_statistics(n_threads, z, normalize);
}
|
||||
|
||||
// Convenience wrapper: apply_latent_statistics() in un-normalize mode.
sd::Tensor<float> un_normalize_latents(const int n_threads,
                                       const sd::Tensor<float>& z) {
    constexpr bool normalize = false;
    return apply_latent_statistics(n_threads, z, normalize);
}
|
||||
|
||||
int get_encoder_output_channels(int input_channels) override {
|
||||
SD_UNUSED(input_channels);
|
||||
return 256;
|
||||
|
|
|
|||
|
|
@ -1487,6 +1487,9 @@ namespace LTXV {
|
|||
->forward(ctx, ggml_ext_scale(ctx->ggml_ctx, av_ca_audio_timestep, av_ca_factor))
|
||||
.first;
|
||||
|
||||
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.prelude", "vx");
|
||||
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.prelude", "ax");
|
||||
|
||||
for (int i = 0; i < cfg.num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<BasicAVTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
|
||||
auto out = block->forward(ctx,
|
||||
|
|
@ -1509,6 +1512,8 @@ namespace LTXV {
|
|||
a_prompt_timestep_mod);
|
||||
vx = out.first;
|
||||
ax = out.second;
|
||||
sd::ggml_graph_cut::mark_graph_cut(vx, "ltxav.transformer_blocks." + std::to_string(i), "vx");
|
||||
sd::ggml_graph_cut::mark_graph_cut(ax, "ltxav.transformer_blocks." + std::to_string(i), "ax");
|
||||
}
|
||||
|
||||
auto v_shift_scale = get_output_scale_shift(ctx, params["scale_shift_table"], v_embedded_time, cfg.hidden_size);
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#include "guidance.h"
|
||||
#include "lora.hpp"
|
||||
#include "ltx_audio_vae.h"
|
||||
#include "ltx_latent_upscaler.hpp"
|
||||
#include "ltx_vae.hpp"
|
||||
#include "pmid.hpp"
|
||||
#include "sample-cache.h"
|
||||
|
|
@ -883,7 +884,8 @@ public:
|
|||
auto create_tae = [&]() -> std::shared_ptr<VAE> {
|
||||
if (sd_version_is_wan(version) ||
|
||||
sd_version_is_qwen_image(version) ||
|
||||
sd_version_is_anima(version)) {
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_ltxav(version)) {
|
||||
return std::make_shared<TinyVideoAutoEncoder>(backend_for(SDBackendModule::VAE),
|
||||
params_backend_for(SDBackendModule::VAE),
|
||||
tensor_storage_map,
|
||||
|
|
@ -1430,7 +1432,7 @@ public:
|
|||
}
|
||||
auto lora = std::make_shared<LoraModel>(lora_id,
|
||||
backend_for(module),
|
||||
params_backend_for(module),
|
||||
backend_for(module),
|
||||
lora_path,
|
||||
is_high_noise ? "model.high_noise_" : "",
|
||||
version);
|
||||
|
|
@ -2421,6 +2423,24 @@ public:
|
|||
return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y);
|
||||
}
|
||||
|
||||
// Normalizes `x` through the first-stage model when that model is an
// LTXVideoVAE; otherwise logs an error and returns an empty tensor.
sd::Tensor<float> normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return video_vae->normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent normalization requires LTX video VAE");
    return {};
}
|
||||
|
||||
// Un-normalizes `x` through the first-stage model when that model is an
// LTXVideoVAE; otherwise logs an error and returns an empty tensor.
sd::Tensor<float> un_normalize_ltx_video_latents(const sd::Tensor<float>& x) {
    if (auto video_vae = std::dynamic_pointer_cast<LTXVideoVAE>(first_stage_model)) {
        return video_vae->un_normalize_latents(n_threads, x);
    }
    LOG_ERROR("LTX latent un-normalization requires LTX video VAE");
    return {};
}
|
||||
|
||||
sd::Tensor<float> decode_ltx_audio_latent(const sd::Tensor<float>& audio_latent) {
|
||||
if (audio_vae_model == nullptr || audio_latent.empty()) {
|
||||
return {};
|
||||
|
|
@ -2704,16 +2724,18 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) {
|
|||
}
|
||||
|
||||
// Resets `hires_params` to its defaults: hires disabled, latent upscaler,
// no model path, scale 2.0, no explicit target size, steps 0,
// denoising strength 0.7, upscale tile size 128 and no custom sigma list.
// `hires_params` must be non-null.
void sd_hires_params_init(sd_hires_params_t* hires_params) {
    // Value-initialize first so any field not listed below starts zeroed.
    *hires_params                     = {};
    hires_params->enabled             = false;
    hires_params->upscaler            = SD_HIRES_UPSCALER_LATENT;
    hires_params->model_path          = nullptr;
    hires_params->scale               = 2.0f;
    hires_params->target_width        = 0;
    hires_params->target_height       = 0;
    hires_params->steps               = 0;
    hires_params->denoising_strength  = 0.7f;
    hires_params->upscale_tile_size   = 128;
    hires_params->custom_sigmas       = nullptr;
    hires_params->custom_sigmas_count = 0;
}
|
||||
|
||||
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
||||
|
|
@ -2986,6 +3008,16 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
|
|||
sd_vid_gen_params->moe_boundary = 0.875f;
|
||||
sd_vid_gen_params->vace_strength = 1.f;
|
||||
sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f};
|
||||
sd_vid_gen_params->hires.enabled = false;
|
||||
sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT;
|
||||
sd_vid_gen_params->hires.scale = 2.f;
|
||||
sd_vid_gen_params->hires.target_width = 0;
|
||||
sd_vid_gen_params->hires.target_height = 0;
|
||||
sd_vid_gen_params->hires.steps = 0;
|
||||
sd_vid_gen_params->hires.denoising_strength = 0.7f;
|
||||
sd_vid_gen_params->hires.upscale_tile_size = 128;
|
||||
sd_vid_gen_params->hires.custom_sigmas = nullptr;
|
||||
sd_vid_gen_params->hires.custom_sigmas_count = 0;
|
||||
sd_cache_params_init(&sd_vid_gen_params->cache);
|
||||
}
|
||||
|
||||
|
|
@ -3235,6 +3267,7 @@ struct GenerationRequest {
|
|||
vace_strength = sd_vid_gen_params->vace_strength;
|
||||
guidance = sd_vid_gen_params->sample_params.guidance;
|
||||
high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance;
|
||||
hires = sd_vid_gen_params->hires;
|
||||
resolve(sd_ctx);
|
||||
if (frames != requested_frames) {
|
||||
LOG_WARN("align video frames from %d to %d for %s",
|
||||
|
|
@ -3293,6 +3326,20 @@ struct GenerationRequest {
|
|||
hires.enabled = false;
|
||||
return;
|
||||
}
|
||||
if (hires.custom_sigmas_count < 0) {
|
||||
LOG_WARN("hires custom sigmas count is negative, ignoring custom sigmas");
|
||||
hires.custom_sigmas = nullptr;
|
||||
hires.custom_sigmas_count = 0;
|
||||
}
|
||||
if (hires.custom_sigmas_count > 0 && hires.custom_sigmas == nullptr) {
|
||||
LOG_WARN("hires custom sigmas count is positive but custom sigmas are null, ignoring custom sigmas");
|
||||
hires.custom_sigmas_count = 0;
|
||||
}
|
||||
if (hires.custom_sigmas_count == 1) {
|
||||
LOG_WARN("hires custom sigmas requires at least two values, ignoring custom sigmas");
|
||||
hires.custom_sigmas = nullptr;
|
||||
hires.custom_sigmas_count = 0;
|
||||
}
|
||||
hires.denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);
|
||||
hires.steps = std::max(0, hires.steps);
|
||||
|
||||
|
|
@ -3657,6 +3704,85 @@ static sd::Tensor<float> pack_ltxav_audio_and_video_denoise_mask(const sd::Tenso
|
|||
return sd::ops::concat(video_mask_full, audio_mask, 3);
|
||||
}
|
||||
|
||||
// Creates a denoise mask for `video_latent` filled with `value`.
// The mask copies the latent's first three extents and uses 1 for the
// remaining two; an empty latent yields an empty mask.
static sd::Tensor<float> make_ltxav_video_denoise_mask(const sd::Tensor<float>& video_latent, float value = 1.f) {
    if (video_latent.empty()) {
        return {};
    }
    const auto& extents = video_latent.shape();
    return sd::full<float>({extents[0], extents[1], extents[2], 1, 1}, value);
}
|
||||
|
||||
// Encodes a conditioning image into a latent with the context's first-stage
// encoder. `name` is only used in the error message.
// Returns an empty tensor when the context or image is missing, or when
// encoding fails (the failure is logged).
static sd::Tensor<float> encode_ltxav_condition_image(sd_ctx_t* sd_ctx,
                                                      const sd::Tensor<float>& image,
                                                      const char* name) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr) {
        return {};
    }
    if (image.empty()) {
        return {};
    }

    // Insert a singleton axis at index 2 before encoding
    // (presumably the frame axis of a one-frame video; confirm against the VAE).
    const auto& dims     = image.shape();
    auto single_frame    = image.reshape({dims[0], dims[1], 1, dims[2], dims[3]});
    auto encoded_latent  = sd_ctx->sd->encode_first_stage(single_frame);
    if (encoded_latent.empty()) {
        LOG_ERROR("failed to encode LTXAV %s image", name);
    }
    return encoded_latent;
}
|
||||
|
||||
// Writes `condition_latent` into `video_latent` along axis 2, starting at
// `latent_idx`, and fills the same axis-2 range of `video_mask` with
// `conditioned_mask`.
//
// Returns false (without modifying anything) when the targets are null/empty,
// when the condition latent is empty or differs from the video latent in
// extents 0, 1 or 3, or when the requested range does not fit inside the
// video latent's axis 2. `name` is only used in log messages.
static bool apply_ltxav_condition_by_latent_index(sd::Tensor<float>* video_latent,
                                                  sd::Tensor<float>* video_mask,
                                                  const sd::Tensor<float>& condition_latent,
                                                  int64_t latent_idx,
                                                  const char* name,
                                                  float conditioned_mask) {
    if (video_latent == nullptr || video_mask == nullptr) {
        return false;
    }
    if (video_latent->empty() || video_mask->empty()) {
        return false;
    }

    bool shape_matches = !condition_latent.empty();
    if (shape_matches) {
        const auto& cond_dims  = condition_latent.shape();
        const auto& video_dims = video_latent->shape();
        shape_matches          = cond_dims[0] == video_dims[0] &&
                        cond_dims[1] == video_dims[1] &&
                        cond_dims[3] == video_dims[3];
    }
    if (!shape_matches) {
        LOG_ERROR("invalid LTXAV %s condition latent shape", name);
        return false;
    }

    const int64_t total_frames = video_latent->shape()[2];
    const int64_t span_frames  = condition_latent.shape()[2];
    const int64_t span_end     = latent_idx + span_frames;
    const bool range_ok        = latent_idx >= 0 && span_frames > 0 && span_end <= total_frames;
    if (!range_ok) {
        LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
                  name,
                  latent_idx,
                  span_frames,
                  total_frames);
        return false;
    }

    sd::ops::slice_assign(video_latent, 2, latent_idx, span_end, condition_latent);
    sd::ops::fill_slice(video_mask, 2, latent_idx, span_end, conditioned_mask);
    return true;
}
|
||||
|
||||
// Encodes `image` and applies the resulting latent to `video_latent` /
// `video_mask` at `latent_idx`. The mask value written over the conditioned
// range is 1 - clamp(strength, 0, 1).
// Returns false when encoding fails or the latent cannot be applied.
static bool apply_ltxav_condition_image_by_latent_index(sd_ctx_t* sd_ctx,
                                                        const sd::Tensor<float>& image,
                                                        sd::Tensor<float>* video_latent,
                                                        sd::Tensor<float>* video_mask,
                                                        int64_t latent_idx,
                                                        const char* name,
                                                        float strength) {
    auto encoded = encode_ltxav_condition_image(sd_ctx, image, name);
    if (encoded.empty()) {
        return false;
    }

    const float mask_value = 1.0f - std::clamp(strength, 0.f, 1.f);
    return apply_ltxav_condition_by_latent_index(video_latent,
                                                 video_mask,
                                                 encoded,
                                                 latent_idx,
                                                 name,
                                                 mask_value);
}
|
||||
|
||||
static sd::Tensor<float> unpack_ltxav_audio_latent(const sd::Tensor<float>& packed_latent,
|
||||
int audio_length,
|
||||
int video_channels) {
|
||||
|
|
@ -4218,6 +4344,53 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
|
|||
return {};
|
||||
}
|
||||
|
||||
// Produces the sigma schedule used for the hires / latent-upscale refine pass.
//
// - If `hires` carries a custom sigma list (count > 0 and non-null pointer),
//   that list is returned verbatim and `*scheduler_steps_out` is set to
//   its size minus one.
// - Otherwise the step count is `hires.steps` (or `default_steps` when that is
//   not positive), scaled up by 1 / denoising_strength, a full schedule is
//   requested from the denoiser, and only its tail is returned so that the
//   refine pass starts part-way through the schedule.
//
// `scheduler_steps_out` may be null; when non-null it receives the number of
// scheduler steps that were used (0 until a schedule has been chosen).
// Returns an empty vector if the denoiser returns fewer sigmas than the
// computed start offset requires.
static std::vector<float> make_hires_sigma_schedule(sd_ctx_t* sd_ctx,
                                                    const sd_hires_params_t& hires,
                                                    const sd_sample_params_t& sample_params,
                                                    sample_method_t sample_method,
                                                    int default_steps,
                                                    int sample_seq_len,
                                                    int* scheduler_steps_out) {
    if (scheduler_steps_out != nullptr) {
        *scheduler_steps_out = 0;
    }

    if (hires.custom_sigmas_count > 0 && hires.custom_sigmas != nullptr) {
        std::vector<float> custom_sigmas(hires.custom_sigmas,
                                         hires.custom_sigmas + hires.custom_sigmas_count);
        if (scheduler_steps_out != nullptr) {
            *scheduler_steps_out = static_cast<int>(custom_sigmas.size()) - 1;
        }
        return custom_sigmas;
    }

    // Same bounds the request-resolution code applies. Re-applied here so a
    // zero or negative strength can never reach the division below.
    const float denoising_strength = std::clamp(hires.denoising_strength, 0.0001f, 1.f);

    int effective_steps = hires.steps > 0 ? hires.steps : default_steps;
    effective_steps     = std::max(1, effective_steps);

    // sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
    // unlike img2img which trims from a fixed step count.
    int scheduler_steps = static_cast<int>(effective_steps / denoising_strength);
    scheduler_steps     = std::max(1, scheduler_steps);

    scheduler_t scheduler     = resolve_scheduler(sd_ctx,
                                                  sample_params.scheduler,
                                                  sample_method);
    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(scheduler_steps,
                                                                 sample_seq_len,
                                                                 scheduler,
                                                                 sd_ctx->sd->version,
                                                                 sample_params.extra_sample_args);

    size_t t_enc = static_cast<size_t>(scheduler_steps * denoising_strength);
    if (t_enc >= static_cast<size_t>(scheduler_steps)) {
        t_enc = static_cast<size_t>(scheduler_steps) - 1;
    }
    if (scheduler_steps_out != nullptr) {
        *scheduler_steps_out = scheduler_steps;
    }

    // t_enc <= scheduler_steps - 1, so this subtraction cannot wrap.
    const size_t first_sigma = static_cast<size_t>(scheduler_steps) - t_enc - 1;
    if (first_sigma >= sigmas.size()) {
        // Advancing begin() past end() would be undefined behaviour.
        LOG_ERROR("hires sigma schedule is too short: got %zu sigmas for %d scheduler steps",
                  sigmas.size(),
                  scheduler_steps);
        return {};
    }
    return std::vector<float>(sigmas.begin() + static_cast<std::vector<float>::difference_type>(first_sigma),
                              sigmas.end());
}
|
||||
|
||||
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
|
||||
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
|
||||
return nullptr;
|
||||
|
|
@ -4340,29 +4513,20 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||
}
|
||||
}
|
||||
|
||||
int hires_steps = request.hires.steps > 0 ? request.hires.steps : plan.sample_steps;
|
||||
|
||||
// sd-webui behavior: scale up total steps so trimming by denoising_strength yields exactly hires_steps effective steps,
|
||||
// unlike img2img which trims from a fixed step count
|
||||
hires_steps = static_cast<int>(hires_steps / request.hires.denoising_strength);
|
||||
|
||||
std::vector<float> hires_sigmas = sd_ctx->sd->denoiser->get_sigmas(
|
||||
hires_steps,
|
||||
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
|
||||
sd_img_gen_params->sample_params.scheduler,
|
||||
sd_ctx->sd->version,
|
||||
sd_img_gen_params->sample_params.extra_sample_args);
|
||||
|
||||
size_t t_enc = static_cast<size_t>(hires_steps * request.hires.denoising_strength);
|
||||
if (t_enc >= static_cast<size_t>(hires_steps)) {
|
||||
t_enc = static_cast<size_t>(hires_steps) - 1;
|
||||
}
|
||||
std::vector<float> hires_sigma_sched(hires_sigmas.begin() + hires_steps - static_cast<int>(t_enc) - 1,
|
||||
hires_sigmas.end());
|
||||
LOG_INFO("hires fix: %d steps, denoising_strength=%.2f, sigma_sched_size=%zu",
|
||||
hires_steps,
|
||||
int hires_scheduler_steps = 0;
|
||||
std::vector<float> hires_sigma_sched =
|
||||
make_hires_sigma_schedule(sd_ctx,
|
||||
request.hires,
|
||||
sd_img_gen_params->sample_params,
|
||||
plan.sample_method,
|
||||
plan.sample_steps,
|
||||
sd_ctx->sd->get_image_seq_len(request.hires.target_height, request.hires.target_width),
|
||||
&hires_scheduler_steps);
|
||||
LOG_INFO("hires fix: scheduler_steps=%d, denoising_strength=%.2f, sigma_sched_size=%zu%s",
|
||||
hires_scheduler_steps,
|
||||
request.hires.denoising_strength,
|
||||
hires_sigma_sched.size());
|
||||
hires_sigma_sched.size(),
|
||||
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
|
||||
|
||||
std::vector<sd::Tensor<float>> hires_final_latents;
|
||||
int64_t hires_denoise_start = ggml_time_ms();
|
||||
|
|
@ -4510,44 +4674,7 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||
|
||||
float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
|
||||
float conditioned_mask = 1.0f - conditioning_strength;
|
||||
latents.denoise_mask = sd::full<float>({latents.init_latent.shape()[0],
|
||||
latents.init_latent.shape()[1],
|
||||
latents.init_latent.shape()[2],
|
||||
1,
|
||||
1},
|
||||
1.f);
|
||||
|
||||
auto encode_ltxav_condition_image = [&](const sd::Tensor<float>& image, const char* name) -> sd::Tensor<float> {
|
||||
auto condition_image = image.reshape({image.shape()[0],
|
||||
image.shape()[1],
|
||||
1,
|
||||
image.shape()[2],
|
||||
image.shape()[3]});
|
||||
auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
|
||||
if (condition_latent.empty()) {
|
||||
LOG_ERROR("failed to encode LTXAV %s image", name);
|
||||
}
|
||||
return condition_latent;
|
||||
};
|
||||
|
||||
auto apply_video_condition_by_latent_index = [&](const sd::Tensor<float>& condition_latent,
|
||||
int64_t latent_idx,
|
||||
const char* name) -> bool {
|
||||
int64_t latent_frames = latents.init_latent.shape()[2];
|
||||
int64_t condition_frames = condition_latent.shape()[2];
|
||||
if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
|
||||
LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
|
||||
name,
|
||||
latent_idx,
|
||||
condition_frames,
|
||||
latent_frames);
|
||||
return false;
|
||||
}
|
||||
|
||||
sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
|
||||
sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
|
||||
return true;
|
||||
};
|
||||
latents.denoise_mask = make_ltxav_video_denoise_mask(latents.init_latent, 1.f);
|
||||
|
||||
auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor<float>& keyframes,
|
||||
int frame_idx,
|
||||
|
|
@ -4585,20 +4712,30 @@ static std::optional<ImageGenerationLatents> prepare_video_generation_latents(sd
|
|||
};
|
||||
|
||||
if (!start_image.empty()) {
|
||||
auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
|
||||
if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
|
||||
if (!apply_ltxav_condition_image_by_latent_index(sd_ctx,
|
||||
start_image,
|
||||
&latents.init_latent,
|
||||
&latents.denoise_mask,
|
||||
0,
|
||||
"init",
|
||||
conditioning_strength)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
if (!end_image.empty()) {
|
||||
auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
|
||||
auto end_image_latent = encode_ltxav_condition_image(sd_ctx, end_image, "end");
|
||||
if (end_image_latent.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
int frame_idx = request->frames - 1;
|
||||
bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
|
||||
bool ok = frame_idx == 0 ? apply_ltxav_condition_by_latent_index(&latents.init_latent,
|
||||
&latents.denoise_mask,
|
||||
end_image_latent,
|
||||
0,
|
||||
"end",
|
||||
conditioned_mask)
|
||||
: apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
|
||||
if (!ok) {
|
||||
return std::nullopt;
|
||||
|
|
@ -4879,6 +5016,175 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
|
|||
return result_images;
|
||||
}
|
||||
|
||||
// Spatially upscales an LTX video latent with a latent-upsampler model loaded
// from `model_path`.
//
// `packed_latent` may carry extra channels beyond the model's latent channel
// count (packed audio). In that case the video part is sliced out, the audio
// part is unpacked using `audio_length`, and the two are re-packed after the
// video latent has been upscaled.
//
// Pipeline: un-normalize the video latent -> run the upsampler -> normalize
// the result. The upsampler is created for this call only and released before
// the final normalization.
//
// Returns an empty tensor on any failure (each failure is logged).
static sd::Tensor<float> upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx,
                                                          const char* model_path,
                                                          const sd::Tensor<float>& packed_latent,
                                                          int audio_length) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || packed_latent.empty()) {
        return {};
    }
    if (strlen(SAFE_STR(model_path)) == 0) {
        LOG_ERROR("LTX latent spatial upscale requires a model path");
        return {};
    }
    if (!sd_ctx->sd->ensure_backend_pair(SDBackendModule::UPSCALER)) {
        return {};
    }

    int latent_channels            = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_latent = packed_latent;
    sd::Tensor<float> audio_latent;
    if (packed_latent.shape()[3] > latent_channels) {
        video_latent = sd::ops::slice(packed_latent, 3, 0, latent_channels);
        audio_latent = unpack_ltxav_audio_latent(packed_latent, audio_length, latent_channels);
        if (audio_latent.empty()) {
            // The extra channels exist but could not be unpacked. Continuing
            // would silently return a video-only latent and drop the audio.
            LOG_ERROR("failed to unpack LTXAV audio latent before LTX latent spatial upscale");
            return {};
        }
    }

    LOG_INFO("LTX latent spatial upscale: latent %dx%dx%dx%d -> x2",
             (int)video_latent.shape()[0],
             (int)video_latent.shape()[1],
             (int)video_latent.shape()[2],
             (int)video_latent.shape()[3]);

    sd::Tensor<float> unnormalized = sd_ctx->sd->un_normalize_ltx_video_latents(video_latent);
    if (unnormalized.empty()) {
        LOG_ERROR("LTX latent un-normalization failed before spatial upscale");
        return {};
    }

    std::unique_ptr<LTXVUpsampler::LatentUpsamplerRunner> upsampler =
        std::make_unique<LTXVUpsampler::LatentUpsamplerRunner>(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER),
                                                               sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER));
    const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
    upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes);
    if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) {
        LOG_ERROR("load LTX latent upsampler failed");
        return {};
    }

    sd::Tensor<float> upscaled = upsampler->compute(sd_ctx->sd->n_threads, unnormalized);
    // Release the upsampler before running the normalization pass.
    upsampler.reset();
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent spatial upscale failed");
        return {};
    }

    upscaled = sd_ctx->sd->normalize_ltx_video_latents(upscaled);
    if (upscaled.empty()) {
        LOG_ERROR("LTX latent normalization failed after spatial upscale");
        return {};
    }

    if (!audio_latent.empty()) {
        upscaled = pack_ltxav_audio_and_video_latents(upscaled, audio_latent);
    }
    return upscaled;
}
|
||||
|
||||
// Re-applies init/end image conditioning to a latent before the refine pass.
//
// Returns true without touching anything when required pointers are missing,
// `*latent` is empty, or neither an init nor an end image was supplied.
// Returns false (after logging) when conditioning was requested but cannot be
// applied. On success `*latent` and `*denoise_mask` are overwritten; when an
// end image is applied with `request.frames > 1`, `*video_positions` is
// overwritten as well.
//
// If `*latent` has more channels than the model's latent channel count, the
// extra part is treated as packed audio: it is unpacked, left unmodified and
// re-packed into the outputs.
static bool apply_ltxv_refine_image_conditioning(sd_ctx_t* sd_ctx,
                                                 const sd_vid_gen_params_t* sd_vid_gen_params,
                                                 const GenerationRequest& request,
                                                 const ImageGenerationLatents& latents,
                                                 sd::Tensor<float>* latent,
                                                 sd::Tensor<float>* denoise_mask,
                                                 sd::Tensor<float>* video_positions) {
    if (sd_ctx == nullptr || sd_ctx->sd == nullptr || sd_vid_gen_params == nullptr) {
        return true;
    }
    if (latent == nullptr || latent->empty() || denoise_mask == nullptr || video_positions == nullptr) {
        return true;
    }

    const bool has_init_image = sd_vid_gen_params->init_image.data != nullptr;
    const bool has_end_image  = sd_vid_gen_params->end_image.data != nullptr;
    if (!has_init_image && !has_end_image) {
        return true;
    }
    if (sd_ctx->sd->vae_decode_only) {
        LOG_ERROR("LTXV refine image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
        return false;
    }

    // Full-strength conditioning: conditioned frames get a mask value of 0.
    constexpr float conditioning_strength = 1.f;
    constexpr float conditioned_mask      = 1.f - conditioning_strength;

    const int latent_channels = sd_ctx->sd->get_latent_channel();
    sd::Tensor<float> video_latent;
    sd::Tensor<float> audio_latent;
    if (latent->shape()[3] > latent_channels) {
        video_latent = sd::ops::slice(*latent, 3, 0, latent_channels);
        audio_latent = unpack_ltxav_audio_latent(*latent, latents.audio_length, latent_channels);
        if (audio_latent.empty()) {
            LOG_ERROR("failed to unpack LTXAV audio latent before image-to-video inplace conditioning");
            return false;
        }
    } else {
        video_latent = *latent;
    }

    // Pixel size the conditioning images are converted to before encoding.
    const int image_width        = static_cast<int>(video_latent.shape()[0]) * request.vae_scale_factor;
    const int image_height       = static_cast<int>(video_latent.shape()[1]) * request.vae_scale_factor;
    sd::Tensor<float> video_mask = make_ltxav_video_denoise_mask(video_latent, 1.f);

    if (has_init_image) {
        sd::Tensor<float> init_tensor = sd_image_to_tensor(sd_vid_gen_params->init_image, image_width, image_height);
        const bool applied            = apply_ltxav_condition_image_by_latent_index(sd_ctx,
                                                                                    init_tensor,
                                                                                    &video_latent,
                                                                                    &video_mask,
                                                                                    0,
                                                                                    "init",
                                                                                    conditioning_strength);
        if (!applied) {
            return false;
        }
    }

    if (has_end_image) {
        sd::Tensor<float> end_tensor = sd_image_to_tensor(sd_vid_gen_params->end_image, image_width, image_height);
        sd::Tensor<float> end_latent = encode_ltxav_condition_image(sd_ctx, end_tensor, "end");
        if (end_latent.empty()) {
            return false;
        }

        const int last_frame_idx = request.frames - 1;
        const bool single_frame  = last_frame_idx == 0;

        // A single-frame request conditions latent frame 0; otherwise the end
        // latent is written starting at the target frame count.
        int64_t end_latent_idx = 0;
        if (!single_frame) {
            if (latents.video_conditioning_frame_count <= 0 || latents.video_target_frame_count <= 0) {
                LOG_ERROR("LTXV FLF2V refine conditioning requires low-resolution keyframe conditioning metadata");
                return false;
            }
            end_latent_idx = latents.video_target_frame_count;
        }

        if (!apply_ltxav_condition_by_latent_index(&video_latent,
                                                   &video_mask,
                                                   end_latent,
                                                   end_latent_idx,
                                                   "end",
                                                   conditioned_mask)) {
            return false;
        }

        if (!single_frame) {
            *video_positions = build_ltxv_video_positions(video_latent.shape()[0],
                                                          video_latent.shape()[1],
                                                          end_latent_idx,
                                                          end_latent.shape()[2],
                                                          last_frame_idx,
                                                          1,
                                                          request.fps,
                                                          request.vae_scale_factor,
                                                          8,
                                                          true);
        }
    }

    if (audio_latent.empty()) {
        *latent       = std::move(video_latent);
        *denoise_mask = std::move(video_mask);
    } else {
        *latent       = pack_ltxav_audio_and_video_latents(video_latent, audio_latent);
        *denoise_mask = pack_ltxav_audio_and_video_denoise_mask(video_mask, video_latent, audio_latent);
    }
    LOG_INFO("LTXV refine image conditioning applied at %dx%d", image_width, image_height);
    return true;
}
|
||||
|
||||
SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||
const sd_vid_gen_params_t* sd_vid_gen_params,
|
||||
sd_image_t** frames_out,
|
||||
|
|
@ -4899,6 +5205,23 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||
int64_t t0 = ggml_time_ms();
|
||||
sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
|
||||
GenerationRequest request(sd_ctx, sd_vid_gen_params);
|
||||
bool latent_upscale_enabled = request.hires.enabled;
|
||||
GenerationRequest hires_request = request;
|
||||
if (latent_upscale_enabled) {
|
||||
if (!sd_version_is_ltxav(sd_ctx->sd->version)) {
|
||||
LOG_ERROR("LTX latent spatial upscale is only supported for LTX video models");
|
||||
return false;
|
||||
}
|
||||
if (request.hires.upscaler != SD_HIRES_UPSCALER_MODEL) {
|
||||
LOG_ERROR("LTX latent spatial upscale currently requires hires upscaler MODEL");
|
||||
return false;
|
||||
}
|
||||
if (strlen(SAFE_STR(request.hires.model_path)) == 0) {
|
||||
LOG_ERROR("LTX latent spatial upscale is enabled but hires model path was not provided");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
sd_ctx->sd->rng->manual_seed(request.seed);
|
||||
sd_ctx->sd->sampler_rng->manual_seed(request.seed);
|
||||
sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
|
||||
|
|
@ -4910,14 +5233,22 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||
return false;
|
||||
}
|
||||
ImageGenerationLatents latents = std::move(*latent_inputs_opt);
|
||||
ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
|
||||
sd_vid_gen_params,
|
||||
request,
|
||||
latents);
|
||||
LOG_INFO("generate_video %dx%dx%d",
|
||||
request.width,
|
||||
request.height,
|
||||
request.frames);
|
||||
|
||||
ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx,
|
||||
sd_vid_gen_params,
|
||||
request,
|
||||
latents);
|
||||
if (latent_upscale_enabled) {
|
||||
LOG_INFO("generate_video %dx%dx%d -> LTX latent spatial upscale",
|
||||
request.width,
|
||||
request.height,
|
||||
request.frames);
|
||||
} else {
|
||||
LOG_INFO("generate_video %dx%dx%d",
|
||||
request.width,
|
||||
request.height,
|
||||
request.frames);
|
||||
}
|
||||
|
||||
int64_t latent_start = ggml_time_ms();
|
||||
int W = request.width / request.vae_scale_factor;
|
||||
|
|
@ -5009,15 +5340,126 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||
latents.video_positions);
|
||||
|
||||
int64_t sampling_end = ggml_time_ms();
|
||||
if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
if (final_latent.empty()) {
|
||||
if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
||||
return false;
|
||||
}
|
||||
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
||||
|
||||
if (latent_upscale_enabled) {
|
||||
int64_t upscale_start = ggml_time_ms();
|
||||
sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
|
||||
request.hires.model_path,
|
||||
final_latent,
|
||||
latents.audio_length);
|
||||
int64_t upscale_end = ggml_time_ms();
|
||||
if (upscaled_latent.empty()) {
|
||||
if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
LOG_INFO("LTX latent spatial upscale completed, taking %.2fs",
|
||||
(upscale_end - upscale_start) * 1.0f / 1000);
|
||||
|
||||
x_t = std::move(upscaled_latent);
|
||||
hires_request.width = static_cast<int>(x_t.shape()[0]) * hires_request.vae_scale_factor;
|
||||
hires_request.height = static_cast<int>(x_t.shape()[1]) * hires_request.vae_scale_factor;
|
||||
if ((request.hires.target_width > 0 || request.hires.target_height > 0) &&
|
||||
(request.hires.target_width != hires_request.width || request.hires.target_height != hires_request.height)) {
|
||||
LOG_WARN("LTX latent spatial upsampler output is %dx%d; ignoring hires target %dx%d",
|
||||
hires_request.width,
|
||||
hires_request.height,
|
||||
request.hires.target_width,
|
||||
request.hires.target_height);
|
||||
}
|
||||
sd::Tensor<float> hires_denoise_mask;
|
||||
sd::Tensor<float> hires_video_positions;
|
||||
if (!apply_ltxv_refine_image_conditioning(sd_ctx,
|
||||
sd_vid_gen_params,
|
||||
hires_request,
|
||||
latents,
|
||||
&x_t,
|
||||
&hires_denoise_mask,
|
||||
&hires_video_positions)) {
|
||||
if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
|
||||
|
||||
W = hires_request.width / hires_request.vae_scale_factor;
|
||||
H = hires_request.height / hires_request.vae_scale_factor;
|
||||
T = static_cast<int>(x_t.shape()[2]);
|
||||
sample_method_t hires_sample_method = plan.sample_method;
|
||||
int hires_scheduler_steps = 0;
|
||||
std::vector<float> hires_sigma_sched =
|
||||
make_hires_sigma_schedule(sd_ctx,
|
||||
request.hires,
|
||||
sd_vid_gen_params->sample_params,
|
||||
hires_sample_method,
|
||||
plan.sample_steps,
|
||||
sd_ctx->sd->get_image_seq_len(hires_request.height, hires_request.width) * T,
|
||||
&hires_scheduler_steps);
|
||||
float hires_eta = resolve_eta(sd_ctx,
|
||||
sd_vid_gen_params->sample_params.eta,
|
||||
hires_sample_method);
|
||||
|
||||
LOG_DEBUG("sample(latent upscale) %dx%dx%d", W, H, T);
|
||||
LOG_INFO("LTX latent spatial upscale refine: scheduler_steps=%d, denoising_strength=%.2f, sampler=%s, sigma_sched_size=%zu%s",
|
||||
hires_scheduler_steps,
|
||||
request.hires.denoising_strength,
|
||||
sampling_methods_str[hires_sample_method],
|
||||
hires_sigma_sched.size(),
|
||||
request.hires.custom_sigmas_count > 0 ? ", custom_sigmas=true" : "");
|
||||
|
||||
sampling_start = ggml_time_ms();
|
||||
final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
|
||||
true,
|
||||
x_t,
|
||||
std::move(noise),
|
||||
embeds.cond,
|
||||
hires_request.use_uncond ? embeds.uncond : SDCondition(),
|
||||
embeds.img_cond,
|
||||
embeds.id_cond,
|
||||
sd::Tensor<float>(),
|
||||
0.f,
|
||||
sd_vid_gen_params->sample_params.guidance,
|
||||
hires_eta,
|
||||
sd_vid_gen_params->sample_params.shifted_timestep,
|
||||
hires_sample_method,
|
||||
sd_ctx->sd->is_flow_denoiser(),
|
||||
plan.extra_sample_args,
|
||||
hires_sigma_sched,
|
||||
-1,
|
||||
std::vector<sd::Tensor<float>>{},
|
||||
false,
|
||||
hires_denoise_mask,
|
||||
sd::Tensor<float>(),
|
||||
hires_request.vace_strength,
|
||||
latents.audio_length,
|
||||
static_cast<float>(hires_request.fps),
|
||||
hires_request.cache_params,
|
||||
hires_video_positions);
|
||||
sampling_end = ggml_time_ms();
|
||||
if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
if (final_latent.empty()) {
|
||||
LOG_ERROR("sampling(latent upscale) failed after %.2fs",
|
||||
(sampling_end - sampling_start) * 1.0f / 1000);
|
||||
return false;
|
||||
}
|
||||
LOG_INFO("sampling(latent upscale) completed, taking %.2fs",
|
||||
(sampling_end - sampling_start) * 1.0f / 1000);
|
||||
} else if (sd_ctx->sd->free_params_immediately) {
|
||||
sd_ctx->sd->diffusion_model->free_params_buffer();
|
||||
}
|
||||
|
||||
sd_audio_t* generated_audio = nullptr;
|
||||
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
||||
latents.audio_length > 0 &&
|
||||
|
|
@ -5048,7 +5490,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||
int64_t latent_end = ggml_time_ms();
|
||||
LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000);
|
||||
|
||||
auto result = decode_video_outputs(sd_ctx, request, final_latent, num_frames_out);
|
||||
auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
|
||||
if (result == nullptr) {
|
||||
free_sd_audio(generated_audio);
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -332,6 +332,8 @@ typedef struct {
|
|||
int steps;
|
||||
float denoising_strength;
|
||||
int upscale_tile_size;
|
||||
float* custom_sigmas;
|
||||
int custom_sigmas_count;
|
||||
} sd_hires_params_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -382,6 +384,7 @@ typedef struct {
|
|||
float vace_strength;
|
||||
sd_tiling_params_t vae_tiling_params;
|
||||
sd_cache_params_t cache;
|
||||
sd_hires_params_t hires;
|
||||
} sd_vid_gen_params_t;
|
||||
|
||||
typedef struct sd_ctx_t sd_ctx_t;
|
||||
|
|
|
|||
|
|
@ -322,13 +322,21 @@ class TinyVideoEncoder : public UnaryBlock {
|
|||
int patch_size = 1;
|
||||
|
||||
public:
|
||||
TinyVideoEncoder(int z_channels = 4, int patch_size = 1)
|
||||
int t_downscale = 1;
|
||||
TinyVideoEncoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_downscale = {true, true, false})
|
||||
: z_channels(z_channels), patch_size(patch_size) {
|
||||
// self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool))
|
||||
t_downscale = 1;
|
||||
for (bool downscale : time_downscale) {
|
||||
if (downscale) {
|
||||
t_downscale *= 2;
|
||||
}
|
||||
}
|
||||
int index = 0;
|
||||
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1}));
|
||||
index++; // nn.ReLU()
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
int stride = i == num_layers - 1 ? 1 : 2;
|
||||
int stride = time_downscale[i] ? 2 : 1;
|
||||
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TPool(hidden, stride));
|
||||
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||
for (int j = 0; j < num_blocks; j++) {
|
||||
|
|
@ -375,15 +383,22 @@ class TinyVideoDecoder : public UnaryBlock {
|
|||
static const int num_layers = 3;
|
||||
int channels[num_layers + 1] = {256, 128, 64, 64};
|
||||
int patch_size = 1;
|
||||
int t_upscale = 1;
|
||||
|
||||
public:
|
||||
TinyVideoDecoder(int z_channels = 4, int patch_size = 1)
|
||||
TinyVideoDecoder(int z_channels = 4, int patch_size = 1, std::vector<bool> time_upscale = {false, true, true})
|
||||
: z_channels(z_channels), patch_size(patch_size) {
|
||||
t_upscale = 1;
|
||||
for (bool upscale : time_upscale) {
|
||||
if (upscale) {
|
||||
t_upscale *= 2;
|
||||
}
|
||||
}
|
||||
int index = 1; // Clamp()
|
||||
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1}));
|
||||
index++; // nn.ReLU()
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
int stride = i == 0 ? 1 : 2;
|
||||
int stride = time_upscale[i] ? 2 : 1;
|
||||
for (int j = 0; j < num_blocks; j++) {
|
||||
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(channels[i], channels[i]));
|
||||
}
|
||||
|
|
@ -430,8 +445,8 @@ public:
|
|||
if (patch_size > 1) {
|
||||
h = unpatchify(ctx->ggml_ctx, h, patch_size, 1);
|
||||
}
|
||||
// shape(W, H, 3, 3 + T) => shape(W, H, 3, T)
|
||||
h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]);
|
||||
// shape(W, H, 3, (t_upscale - 1) + T) => shape(W, H, 3, T)
|
||||
h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - (t_upscale - 1), h->nb[1], h->nb[2], h->nb[3], (t_upscale - 1) * h->nb[3]);
|
||||
return h;
|
||||
}
|
||||
};
|
||||
|
|
@ -442,7 +457,9 @@ protected:
|
|||
SDVersion version;
|
||||
|
||||
public:
|
||||
int z_channels = 16;
|
||||
int z_channels = 16;
|
||||
std::vector<bool> time_downscale = {true, true, false};
|
||||
std::vector<bool> time_upscale = {false, true, true};
|
||||
|
||||
public:
|
||||
TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2)
|
||||
|
|
@ -451,21 +468,26 @@ public:
|
|||
if (version == VERSION_WAN2_2_TI2V) {
|
||||
z_channels = 48;
|
||||
patch = 2;
|
||||
} else if (sd_version_is_ltxav(version)) {
|
||||
z_channels = 128;
|
||||
patch = 4;
|
||||
time_downscale = {true, true, true};
|
||||
time_upscale = {true, true, true};
|
||||
}
|
||||
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch));
|
||||
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch, time_upscale));
|
||||
if (!decode_only) {
|
||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch));
|
||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch, time_downscale));
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
|
||||
auto decoder = std::dynamic_pointer_cast<TinyVideoDecoder>(blocks["decoder"]);
|
||||
if (sd_version_is_wan(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
|
||||
// (W, H, C, T) -> (W, H, T, C)
|
||||
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 1, 3, 2));
|
||||
}
|
||||
auto result = decoder->forward(ctx, z);
|
||||
if (sd_version_is_wan(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_ltxav(version)) {
|
||||
// (W, H, C, T) -> (W, H, T, C)
|
||||
result = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, result, 0, 1, 3, 2));
|
||||
}
|
||||
|
|
@ -477,10 +499,10 @@ public:
|
|||
// (W, H, T, C) -> (W, H, C, T)
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));
|
||||
int64_t num_frames = x->ne[3];
|
||||
if (num_frames % 4) {
|
||||
// pad to multiple of 4 at the end
|
||||
if (num_frames % encoder->t_downscale) {
|
||||
// pad to multiple of encoder->t_downscale at the end
|
||||
auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]);
|
||||
for (int i = 0; i < 4 - num_frames % 4; i++) {
|
||||
for (int i = 0; i < encoder->t_downscale - num_frames % encoder->t_downscale; i++) {
|
||||
x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue