koboldcpp/otherarch/ttscpp/src/kokoro_model.h
2025-09-22 21:22:41 +08:00

462 lines
17 KiB
C++

#ifndef kokoro_model_h
#define kokoro_model_h
#include <stdlib.h>
#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <vector>
#include "tts_model.h"
#include "ttstokenizer.h"
#include "phonemizer.h"
// Rather than using ISO 639-2 language codes, Kokoro voice packs specify their corresponding language via their first letter.
// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
// appropriate phonemization protocol can be inferred from the Kokoro voice.
static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
{'a', "gmw/en-US"}, // American English
{'b', "gmw/en"},    // British English
{'e', "roa/es"},    // Spanish
{'f', "roa/fr"},    // French
{'h', "inc/hi"},    // Hindi
{'i', "roa/it"},    // Italian
{'j', "jpx/ja"},    // Japanese
{'p', "roa/pt-BR"}, // Brazilian Portuguese
{'z', "sit/cmn"}    // Mandarin Chinese
};
// Parameters for a single LSTM layer. The reverse_* vectors hold the weights
// of the backward pass when the owning lstm is bidirectional (see
// lstm::bidirectional); otherwise they are presumably left empty — confirm
// against the loader.
struct lstm_cell {
std::vector<ggml_tensor*> weights;
std::vector<ggml_tensor*> biases;
std::vector<ggml_tensor*> reverse_weights;
std::vector<ggml_tensor*> reverse_biases;
};
// A (possibly bidirectional) LSTM built from one or more cells. `hidden` and
// `states` carry the per-layer hidden/cell state tensors; the model keeps a
// registry of all lstms so these defaults can be initialized after load (see
// kokoro_model::lstms and kokoro_model::prep_lstm).
struct lstm {
std::vector<ggml_tensor*> hidden;
std::vector<ggml_tensor*> states;
bool bidirectional = false;
std::vector<lstm_cell*> cells;
};
// One layer of the duration predictor: an LSTM followed by adaptive layer
// normalization whose gamma/beta are projected from the style vector.
struct duration_predictor_layer {
lstm * rnn;
struct ggml_tensor * ada_norm_gamma_weight;
struct ggml_tensor * ada_norm_gamma_bias;
struct ggml_tensor * ada_norm_beta_weight;
struct ggml_tensor * ada_norm_beta_bias;
};
// A residual convolutional block with style-adaptive normalization: two convs,
// each preceded by a norm whose gamma/beta are computed from the style vector.
// pool/upsample are optional resampling convolutions and stay nullptr when the
// block does not change temporal resolution.
struct ada_residual_conv_block {
struct ggml_tensor * conv1;
struct ggml_tensor * conv1_bias;
struct ggml_tensor * conv2;
struct ggml_tensor * conv2_bias;
struct ggml_tensor * norm1_gamma;
struct ggml_tensor * norm1_gamma_bias;
struct ggml_tensor * norm1_beta;
struct ggml_tensor * norm1_beta_bias;
struct ggml_tensor * norm2_gamma;
struct ggml_tensor * norm2_gamma_bias;
struct ggml_tensor * norm2_beta;
struct ggml_tensor * norm2_beta_bias;
struct ggml_tensor * pool = nullptr;
struct ggml_tensor * pool_bias = nullptr;
struct ggml_tensor * upsample = nullptr;
struct ggml_tensor * upsample_bias = nullptr;
};
// The prosody prediction head. Encodes ALBERT hidden states (albert_encode),
// runs them through style-conditioned layers, and projects per-token durations
// (duration_proj) plus F0 and noise curves (f0_proj / n_proj) computed from the
// shared_lstm output via the f0_blocks / n_blocks conv stacks.
struct duration_predictor {
struct ggml_tensor * albert_encode;
struct ggml_tensor * albert_encode_bias;
std::vector<duration_predictor_layer*> layers;
lstm * duration_proj_lstm;
struct ggml_tensor * duration_proj;
struct ggml_tensor * duration_proj_bias;
struct ggml_tensor * n_proj_kernel;
struct ggml_tensor * n_proj_bias;
struct ggml_tensor * f0_proj_kernel;
struct ggml_tensor * f0_proj_bias;
lstm * shared_lstm;
std::vector<ada_residual_conv_block*> f0_blocks;
std::vector<ada_residual_conv_block*> n_blocks;
};
// A single normalization + convolution layer of the text encoder.
struct kokoro_text_encoder_conv_layer {
struct ggml_tensor * norm_gamma;
struct ggml_tensor * norm_beta;
struct ggml_tensor * conv_weight;
struct ggml_tensor * conv_bias;
};
// Text encoder: token embedding, a stack of conv layers (n_conv_layers on the
// model), and a final LSTM.
struct kokoro_text_encoder {
struct ggml_tensor * embd;
std::vector<kokoro_text_encoder_conv_layer*> conv_layers;
lstm * out_lstm;
};
// A residual block of the waveform generator: pairs of dilated convolutions
// (convs1/convs2 with per-conv dilation and padding), each wrapped in
// style-adaptive instance norm (the adain1d_* parameters). input_alphas /
// output_alphas are learned per-conv activation parameters — presumably a
// Snake-style activation as in StyleTTS2-family generators; confirm against
// build_kokoro_generator_res_block.
struct kokoro_generator_residual_block {
std::vector<uint32_t> conv1_dilations;
std::vector<uint32_t> conv1_paddings;
std::vector<ggml_tensor*> adain1d_1_gamma_weights;
std::vector<ggml_tensor*> adain1d_2_gamma_weights;
std::vector<ggml_tensor*> adain1d_1_gamma_biases;
std::vector<ggml_tensor*> adain1d_2_gamma_biases;
std::vector<ggml_tensor*> adain1d_1_beta_weights;
std::vector<ggml_tensor*> adain1d_2_beta_weights;
std::vector<ggml_tensor*> adain1d_1_beta_biases;
std::vector<ggml_tensor*> adain1d_2_beta_biases;
std::vector<ggml_tensor*> input_alphas;
std::vector<ggml_tensor*> output_alphas;
std::vector<ggml_tensor*> convs1_weights;
std::vector<ggml_tensor*> convs1_biases;
std::vector<ggml_tensor*> convs2_weights;
std::vector<ggml_tensor*> convs2_biases;
};
// Noise branch of the generator: a strided input convolution feeding a shared
// residual block.
struct kokoro_noise_residual_block {
uint32_t input_conv_stride;
uint32_t input_conv_padding;
struct ggml_tensor * input_conv;
struct ggml_tensor * input_conv_bias;
struct kokoro_generator_residual_block * res_block;
};
// One upsampling stage of the generator.
struct kokoro_generator_upsample_block {
uint32_t padding;
uint32_t stride;
// these are just conv transpose layers
struct ggml_tensor * upsample_weight;
struct ggml_tensor * upsample_bias;
};
// The waveform generator (vocoder) portion of the decoder: harmonic source
// projection (m_source_*), noise and residual blocks, upsampling stages, and a
// final output convolution. `window` is the STFT window (see
// kokoro_model::window, "hann" by default).
struct kokoro_generator {
// unfortunately the squared sum of the windows needs to be computed dynamically per run because it is dependent
// on the sequence size of the generation and the hop is typically less than half the size of our window.
struct ggml_tensor * window;
struct ggml_tensor * m_source_weight;
struct ggml_tensor * m_source_bias;
struct ggml_tensor * out_conv_weight;
struct ggml_tensor * out_conv_bias;
std::vector<kokoro_noise_residual_block*> noise_blocks;
std::vector<kokoro_generator_residual_block*> res_blocks;
std::vector<kokoro_generator_upsample_block*> ups;
};
// Decoder: convolutional stems for the F0 curve, noise curve and aligned text
// encoding (asr), followed by style-adaptive residual blocks and the final
// generator that produces the audio samples.
struct kokoro_decoder {
struct ggml_tensor * f0_conv;
struct ggml_tensor * f0_conv_bias;
struct ggml_tensor * n_conv;
struct ggml_tensor * n_conv_bias;
struct ggml_tensor * asr_conv;
struct ggml_tensor * asr_conv_bias;
std::vector<ada_residual_conv_block*> decoder_blocks;
ada_residual_conv_block* encoder_block;
kokoro_generator * generator;
};
// One transformer layer of the ALBERT encoder: self-attention (q/k/v/o with
// biases), attention norm, feed-forward (ffn -> ffn_out) and the output norm.
// Note kokoro_model has n_layers = 1 with n_recurrence = 12 — consistent with
// ALBERT's cross-layer parameter sharing, where one layer is applied
// repeatedly.
struct albert_layer {
struct ggml_tensor * ffn;
struct ggml_tensor * ffn_out;
struct ggml_tensor * ffn_bias;
struct ggml_tensor * ffn_out_bias;
struct ggml_tensor * layer_output_norm_weight;
struct ggml_tensor * layer_output_norm_bias;
struct ggml_tensor * q;
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_tensor * o;
struct ggml_tensor * q_bias;
struct ggml_tensor * k_bias;
struct ggml_tensor * v_bias;
struct ggml_tensor * o_bias;
struct ggml_tensor * attn_norm_weight;
struct ggml_tensor * attn_norm_bias;
};
// Weights and configuration for the Kokoro TTS model. The pipeline is:
// ALBERT text encoding -> prosody (duration / F0 / noise) prediction ->
// text encoding -> decoding and waveform generation.
struct kokoro_model : tts_model {
    // standard configuration for Kokoro's ALBERT model
    // tokenization
    uint32_t bos_token_id = 0;
    uint32_t eos_token_id = 0;
    uint32_t space_token_id = 16;

    // duration prediction
    uint32_t max_context_length = 512;
    uint32_t vocab_size = 178;
    uint32_t hidden_size = 768;
    uint32_t n_attn_heads = 12;
    uint32_t n_layers = 1;
    uint32_t n_recurrence = 12;
    uint32_t head_size = 64;
    uint32_t duration_hidden_size = 512;
    uint32_t up_sampling_factor;
    float upsample_scale = 300.0f;
    float scale = 0.125f; // attention scale: 1/sqrt(head_size) = 1/sqrt(64)

    // standard configuration for duration prediction
    uint32_t f0_n_blocks = 3;
    uint32_t n_duration_prediction_layers = 3;
    // while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
    // allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
    // allocation increases node allocation size by O(N)
    uint32_t max_duration_per_token = 20;
    uint32_t style_half_size = 128;

    // standard text encoding configuration
    uint32_t n_conv_layers = 3;

    // standard decoder configuration
    uint32_t n_kernels = 3;
    uint32_t n_upsamples = 2;
    uint32_t n_decoder_blocks = 4;
    uint32_t n_res_blocks = 6;
    uint32_t n_noise_blocks = 2;
    uint32_t out_conv_padding = 3;
    uint32_t post_n_fft = 11;
    uint32_t true_n_fft = 20;
    uint32_t stft_hop = 5;
    uint32_t harmonic_num = 8;
    float sin_amp = 0.1f;
    float noise_std = 0.003f;
    float voice_threshold = 10.0f;
    float sample_rate = 24000.0f;
    std::string window = "hann";

    // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
    // This is just the constant defined above as a tensor.
    struct ggml_tensor * n_kernels_tensor;

    // Kokoro loads albert with use_pooling = true but doesn't use the pooling outputs.
    bool uses_pooling = false;
    bool static_token_types = true;

    // voice name (e.g. "af_heart") -> style vector tensor
    std::map<std::string, struct ggml_tensor *> voices;

    // Albert portion of the model
    struct ggml_tensor * embd_hidden;
    struct ggml_tensor * embd_hidden_bias;
    struct ggml_tensor * token_type_embd = nullptr;
    struct ggml_tensor * token_embd;
    struct ggml_tensor * position_embd;
    struct ggml_tensor * input_norm_weight;
    struct ggml_tensor * input_norm_bias;
    struct ggml_tensor * static_token_type_values = nullptr;
    struct ggml_tensor * pool = nullptr;
    struct ggml_tensor * pool_bias = nullptr;
    std::vector<albert_layer*> layers;
    struct ggml_tensor * harmonic_sampling_norm = nullptr; // a static 1x9 harmonic multiplier
    struct ggml_tensor * sampling_factor_scalar = nullptr; // a static scalar
    struct ggml_tensor * sqrt_tensor = nullptr; // static tensor for constant division

    // Prosody Predictor portion of the model
    struct duration_predictor * prosody_pred;

    // Text encoding portion of the model
    struct kokoro_text_encoder * text_encoder;

    // Decoding and Generation portion of the model
    struct kokoro_decoder * decoder;

    // the default hidden states need to be initialized
    std::vector<lstm*> lstms;

    // worst-case graph node counts accumulated while iterating tensor metadata
    // in setup_from_file (see max_gen_nodes / max_duration_nodes).
    size_t duration_node_counter = 0;
    size_t generation_node_counter = 0;

    // setting this is likely unnecessary as it is precomputed by the post load function.
    uint32_t post_load_tensor_bytes = 13000;

    size_t max_gen_nodes();
    size_t max_duration_nodes();
    lstm * prep_lstm();

    // helper functions for assigning tensors to substructs
    void assign_lstm(lstm * rnn, std::string name, ggml_tensor * tensor);
    void assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor);
    void assign_gen_resblock(kokoro_generator_residual_block * block, std::string name, ggml_tensor * tensor);
    void assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor);
    void assign_decoder_weight(std::string name, ggml_tensor * tensor);
    void assign_duration_weight(std::string name, ggml_tensor * tensor);
    void assign_text_encoder_weight(std::string name, ggml_tensor * tensor);
    void assign_albert_weight(std::string name, ggml_tensor * tensor);
    void post_load_assign();
    void assign_weight(std::string name, ggml_tensor * tensor);
    void prep_layers(gguf_context * meta);
    void prep_constants(gguf_context * meta);

    // Owns the tensor-meta callback so the pointer stored in
    // compute_tensor_meta_cb stays valid for the model's lifetime. Previously
    // setup_from_file stored the address of a stack-local std::function, which
    // dangled as soon as the call returned.
    std::function<void (ggml_tensor *)> tensor_meta_counter;

    // Prepares layers/constants from the gguf metadata, installs the node
    // counting callback, and delegates tensor loading to tts_model.
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
        tensor_meta_counter = [this](ggml_tensor* cur) {
            std::string name = ggml_get_name(cur);
            // lstm tensors are unrolled over the whole context, so they
            // contribute max_context_length nodes rather than one.
            size_t increment = 1;
            if (name.find("lstm") != std::string::npos) {
                increment = max_context_length;
            }
            if (name.find("duration_predictor") != std::string::npos) {
                duration_node_counter += increment;
            } else {
                generation_node_counter += increment;
            }
        };
        compute_tensor_meta_cb = &tensor_meta_counter;
        prep_constants(meta_ctx);
        prep_layers(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "kokoro", 1.6, post_load_tensor_bytes);
    }
};
// A single batch of encoded input for the Kokoro runners. `resp` is filled in
// by the duration runner and consumed by the generation pass.
struct kokoro_ubatch {
size_t n_tokens; // the number of tokens in our encoded sequence
uint32_t * input_tokens; // [n_tokens]
struct kokoro_duration_response * resp = nullptr;
};
// Compute context for the duration-prediction graph. Owns buf_len_output (the
// backend buffer for the predicted lengths); the model pointer is not freed
// here — the duration runner deletes it.
struct kokoro_duration_context : runner_context {
kokoro_duration_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
~kokoro_duration_context() {
ggml_backend_buffer_free(buf_len_output);
}
std::string voice = "af_heart"; // default Kokoro voice pack
struct kokoro_model * model;
ggml_backend_buffer_t buf_len_output = nullptr;
size_t logits_size = 0; // capacity (of floats) of the output length buffer
float * lens = nullptr; // per-token predicted lengths; presumably backed by buf_len_output — confirm in the runner
// graph input tensors for the ALBERT / duration pass
struct ggml_tensor * inp_tokens;
struct ggml_tensor * positions;
struct ggml_tensor * attn_mask;
struct ggml_tensor * token_types = nullptr;
void build_schedule() {
// *5 is headroom over the counted worst-case node count
runner_context::build_schedule(model->max_duration_nodes()*5);
}
};
// Graph-building helpers for the ALBERT encoder portion of the model.
static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context *kctx, const kokoro_ubatch & batch);
static struct ggml_tensor * build_albert_inputs(ggml_context * ctx, kokoro_model * model, ggml_tensor * input_tokens, ggml_tensor * positions, ggml_tensor * token_types);
static struct ggml_tensor * build_albert_norm(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias);
// Graph-building helpers for the decoder / generator portion of the model.
static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct ggml_tensor * x, ada_residual_conv_block * block, struct ggml_tensor * style, struct ggml_tensor * sqrt_tensor);
static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block);
static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style);
// Builders that construct the generator substructs from gguf metadata.
static kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key);
static kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index);
// NOTE(review): this function is declared with the same name as the struct
// kokoro_generator_upsample_block (the function name hides the type name);
// it was presumably meant to follow the *_from_file naming of the builders
// above — confirm against its definition before renaming.
static kokoro_generator_upsample_block* kokoro_generator_upsample_block(gguf_context * meta, int index);
// Maps a Kokoro voice name to its espeak-ng identifier via the voice's first letter.
std::string get_espeak_id_from_kokoro_voice(std::string voice);
struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
// Output of the duration-prediction pass, consumed by the generation graph
// via kokoro_ubatch::resp.
struct kokoro_duration_response {
size_t n_outputs; // number of entries in `lengths`
float * lengths; // predicted duration per token
float * hidden_states; // hidden states passed on to the generation graph
};
// This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
// support the tensor dependent views that would otherwise be necessary.
struct kokoro_duration_runner : tts_runner {
kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
~kokoro_duration_runner() {
if (ctx) {
ggml_free(ctx);
}
// NOTE(review): kokoro_runner's destructor deletes this runner and then also
// frees/deletes its own model pointer; if both runners were constructed with
// the same kokoro_model this is a double free — confirm ownership at the
// construction sites.
model->free();
delete model;
delete kctx;
}
struct single_pass_tokenizer * tokenizer;
kokoro_model * model;
kokoro_duration_context * kctx;
void init_build() {
tts_runner::init_build(&kctx->buf_compute_meta);
}
void prepare_post_load();
// builds the largest batch the graph can see, used for buffer sizing
struct kokoro_ubatch build_worst_case_batch();
void set_inputs(kokoro_ubatch & batch);
struct ggml_cgraph * build_kokoro_duration_graph(kokoro_ubatch & batch);
void run(kokoro_ubatch & ubatch);
};
// Compute context for the speech-generation graph. Frees the schedule,
// backends and output buffer it uses; the model pointer is not freed here —
// the runner deletes it.
struct kokoro_context : runner_context {
kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
~kokoro_context() {
ggml_backend_sched_free(sched);
ggml_backend_free(backend_cpu);
if (backend) {
ggml_backend_free(backend);
}
if (buf_output) {
ggml_backend_buffer_free(buf_output);
}
}
std::string voice = "af_heart"; // default Kokoro voice pack
struct kokoro_model * model;
uint32_t total_duration; // sum of predicted per-token durations
uint32_t sequence_length;
// graph input tensors for the generation pass
struct ggml_tensor * inp_tokens;
struct ggml_tensor * duration_pred;
struct ggml_tensor * duration_mask;
struct ggml_tensor * window_sq_sum; // needs to be calculatd from the generator window.
struct ggml_tensor * uv_noise_data;
void build_schedule() {
// *30 is headroom over the counted worst-case node count
runner_context::build_schedule(model->max_gen_nodes()*30);
}
};
// TODO: now that we are passing the context down to these methods we should clean up their parameters
// Builds the full generator subgraph (noise + residual + upsample blocks).
static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, struct ggml_tensor * style, struct ggml_tensor * f0_curve, kokoro_generator* generator, int sequence_length, struct ggml_tensor * window_sq_sum, ggml_cgraph * gf);
// Builds the harmonic/noise source-signal subgraph from the F0 curve.
static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, int harmonic_num, int sequence_length, float voice_threshold, float sin_amp, float noise_std);
struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
// This manages the graph compilation of computation for the Kokoro model.
struct kokoro_runner : tts_runner {
kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
tts_runner::sampling_rate = 24000.0f;
tts_runner::supports_voices = true;
};
~kokoro_runner() {
if (ctx) {
ggml_free(ctx);
}
// NOTE(review): drunner's destructor also frees/deletes its model; if it was
// constructed with the same kokoro_model as this runner, the free/delete
// below is a double free — confirm ownership at the construction sites.
delete drunner;
model->free();
delete model;
delete kctx;
delete phmzr;
}
struct single_pass_tokenizer * tokenizer;
kokoro_model * model;
kokoro_context * kctx;
kokoro_duration_runner * drunner; // owned; runs the duration pass before generation
phonemizer * phmzr; // owned; text -> phoneme conversion
std::string default_voice = "af_heart";
void init_build() {
tts_runner::init_build(&kctx->buf_compute_meta);
}
std::vector<std::string> list_voices();
// tokenizes clause-split text into per-clause token id sequences
std::vector<std::vector<uint32_t>> tokenize_chunks(std::vector<std::string> clauses);
void assign_weight(std::string name, ggml_tensor * tensor);
void prepare_post_load();
kokoro_ubatch build_worst_case_batch();
void set_inputs(kokoro_ubatch & batch, uint32_t total_size);
struct ggml_cgraph * build_kokoro_graph(kokoro_ubatch & batch);
void run(kokoro_ubatch & batch, struct tts_response * outputs);
// end-to-end generation: phonemize, tokenize, predict durations, synthesize
int generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code = "");
};
#endif