koboldcpp/otherarch/ttscpp/src/ttst5_encoder_model.h
2025-08-17 14:11:30 +08:00

130 lines
4.3 KiB
C++

#ifndef t5_encoder_model_h
#define t5_encoder_model_h
#include "tts_model.h"
#include "ttstokenizer.h"
// Identifiers for the tensors that make up the T5 encoder model.
// Enumerators are numbered sequentially starting at 0.
enum t5_tensor {
    // Tensors without a LAYER marker in their name.
    T5_EMBD = 0,
    T5_NORM,
    T5_DOWN_PROJ,
    T5_DOWN_PROJ_BIAS,
    T5_RELATIVE_BIAS,

    // Tensors with a LAYER marker in their name (attention part).
    T5_LAYER_ATTN_Q,
    T5_LAYER_ATTN_K,
    T5_LAYER_ATTN_V,
    T5_LAYER_ATTN_O,
    T5_LAYER_ATTN_NORM,

    // Tensors with a LAYER marker in their name (feed-forward part).
    T5_LAYER_WI_0,
    T5_LAYER_WI_1,
    T5_LAYER_WO,
    T5_LAYER_OUT_NORM,
};
// Weight tensors held for one entry of t5_encoder::layers.
// Every pointer starts out null so that a layer whose weights have not been
// assigned yet holds a well-defined value instead of an indeterminate address.
// The struct does not free the tensors it points to (it has no destructor).
struct t5_layer {
    // Attention weights (names mirror T5_LAYER_ATTN_Q/K/V/O).
    struct ggml_tensor * q = nullptr;
    struct ggml_tensor * k = nullptr;
    struct ggml_tensor * v = nullptr;
    struct ggml_tensor * o = nullptr;
    // Norm weight for the attention part (mirrors T5_LAYER_ATTN_NORM).
    struct ggml_tensor * attn_norm = nullptr;

    // Feed-forward weights (names mirror T5_LAYER_WI_0, T5_LAYER_WI_1, T5_LAYER_WO).
    struct ggml_tensor * wi_0 = nullptr;
    struct ggml_tensor * wi_1 = nullptr;
    struct ggml_tensor * wo = nullptr;
    // Norm weight for the feed-forward part; presumably filled from
    // T5_LAYER_OUT_NORM — confirm in the .cpp.
    struct ggml_tensor * mlp_norm = nullptr;
};
// This struct maintains the static tensors for a t5_encoder model.
// The default configuration is copied from the standard configuration for
// flan-t5-xl. Note that this model is slightly different from a standard t5 encoder.
// Specifically, this model has a down projection which converts the text encoder's
// hidden size to the hidden size of the parler decoder.
// Hyperparameters and weight tensors of the T5 text encoder, layered on top of
// the generic tts_model base.
struct t5_encoder : tts_model {
    // Default hyperparameters (flan-t5-xl configuration).
    // NOTE(review): the previous comment here referred to the 44khz DAC audio
    // codec, which looks like a copy-paste from another model header.
    // Presumably prep_constants() overrides these from the gguf metadata —
    // confirm in the .cpp.
    uint32_t n_layers = 24;
    uint32_t n_attn_heads = 32;
    uint32_t head_size = 64;
    uint32_t hidden_size = 2048;
    uint32_t relative_attn_buckets = 32;
    uint32_t eos_token_id = 1;
    uint32_t bos_token_id = 0;
    uint32_t max_context_length = 512;
    uint32_t output_size = 1536;
    // No built-in default is known for the vocabulary size; 0 replaces the
    // previously indeterminate value until it is set elsewhere.
    uint32_t vocab_size = 0;

    // Model-wide weight tensors. All start out null, consistent with the
    // down projection members, so unassigned weights are well-defined.
    struct ggml_tensor * embd = nullptr;
    struct ggml_tensor * relative_attn_bias = nullptr;
    struct ggml_tensor * out_norm = nullptr;
    struct ggml_tensor * down_proj = nullptr;
    struct ggml_tensor * down_proj_bias = nullptr;

    // Per-layer weight tensors.
    std::vector<t5_layer> layers;

    // Defined out of line; see the .cpp for their behavior.
    void assign_weight(std::string name, ggml_tensor * tensor);
    void prep_layers(gguf_context * meta);
    void prep_constants(gguf_context * meta);

    // Prepares this model from an opened gguf file: runs prep_constants(),
    // then prep_layers(), then delegates to tts_model::setup_from_file().
    //   meta_ctx     - gguf metadata context, forwarded to all three calls.
    //   load_context - ggml context forwarded to the base implementation.
    //   cpu_only     - forwarded to the base implementation (defaults to true).
    // The "t5encoder" string and the 1.25 factor are passed through to the
    // base class; their meaning is defined there (NOTE(review): 1.25 looks
    // like a buffer size multiplier — confirm in tts_model).
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
        prep_constants(meta_ctx);
        prep_layers(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "t5encoder", 1.25);
    }
};
// For assigning weights from gguf file to local model.
// Both functions are only declared here; their definitions live elsewhere.
// Presumably `name` is the tensor's name as stored in the gguf file and selects
// which member of the model (or of `layer`) receives `tensor` — confirm in the .cpp.
void assign_to_t5_encoder(t5_encoder * model, const std::string name, ggml_tensor * tensor);
void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, ggml_tensor * tensor);
// Per-runner state for the T5 encoder: the model it runs plus the tensors that
// are used as graph inputs, on top of the generic runner_context base.
struct t5_context : runner_context {
    // Stores `model` and forwards `n_threads` to runner_context.
    t5_context(t5_encoder * model, int n_threads) : runner_context(n_threads), model(model) {}

    // The model this context runs. Not freed by this struct (it declares no
    // destructor).
    struct t5_encoder * model;

    // Graph input tensors. They start out null so that reading them before
    // they are assigned yields a well-defined value rather than an
    // indeterminate address.
    struct ggml_tensor * inp_tokens = nullptr;
    struct ggml_tensor * positions = nullptr;
    struct ggml_tensor * attn_mask = nullptr;
    struct ggml_tensor * inp_pos_bucket = nullptr;

    // Builds the schedule through the base class, sized by the model's
    // max_nodes().
    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }
};
// Builds a t5_context for `model` using `n_threads` threads; `use_cpu` defaults to true.
// Only declared here. NOTE(review): t5_runner's destructor deletes the context it is
// given, so presumably the returned pointer is heap-allocated and meant to be handed
// to a t5_runner — confirm against the definition.
struct t5_context * build_new_t5_context(struct t5_encoder * model, int n_threads, bool use_cpu = true);
// One batch of token ids handed to the T5 encoder.
// Members are default-initialized to an empty batch so a freshly declared
// t5_ubatch never carries an indeterminate size or pointer. The struct does
// not free input_tokens (it has no destructor).
struct t5_ubatch {
    size_t n_tokens = 0;               // the number of tokens in our encoded sequence
    uint32_t * input_tokens = nullptr; // [n_tokens]
};
// Graph-building helpers; only declared here, their definitions are not visible in this header.
// NOTE(review): both are declared `static`, i.e. with internal linkage, so every
// translation unit that includes this header gets its own declaration. A unit that
// includes the header without defining them may emit "declared static but never
// defined" warnings — consider moving these declarations into the .cpp.
static struct ggml_tensor * build_t5_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * weight);
static struct ggml_tensor * build_t5_attn_mask(ggml_context * ctx, struct t5_context *t5ctx, const t5_ubatch & batch);
// This struct is intended to manage the t5 encoder model's graph compilation and compute function.
//
// Ownership: the destructor frees and deletes `model` and deletes `t5ctx`, so a
// t5_runner takes over both pointers it is constructed with. `tokenizer` is not
// released here.
struct t5_runner : tts_runner {
    // Stores the three pointers. The initializer list follows member
    // declaration order, which is the order initialization actually happens in.
    t5_runner(t5_encoder * model, t5_context * context, unigram_tokenizer * tokenizer)
        : tokenizer(tokenizer), model(model), t5ctx(context) {}

    // Releases the ggml context (if any), then the model and the runner context.
    ~t5_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        // Guard the member call: `delete` tolerates null, `model->free()` does not.
        if (model) {
            model->free();
            delete model;
        }
        delete t5ctx;
    }

    // The destructor deletes raw pointers, so a copied runner would free the
    // same model and context twice; copying is therefore disabled.
    t5_runner(const t5_runner &) = delete;
    t5_runner & operator=(const t5_runner &) = delete;

    struct unigram_tokenizer * tokenizer;
    t5_encoder * model;
    t5_context * t5ctx;

    // Starts a graph build through the base class, using the context's
    // compute meta buffer.
    void init_build() {
        tts_runner::init_build(&t5ctx->buf_compute_meta);
    }

    // The members below are defined out of line; see the .cpp for their behavior.
    void prepare_post_load();
    struct t5_ubatch build_worst_case_batch();
    void set_inputs(t5_ubatch & batch);
    struct ggml_cgraph * build_t5_graph(t5_ubatch & batch);
    void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
    int generate(std::string prompt, struct tts_response * response);
};
// Creates a t5_runner from the file at `file_path`, using `n_threads` threads and the
// given tokenizer; `cpu_only` defaults to true. Only declared here.
// NOTE(review): returns a raw pointer — presumably the caller owns the runner and must
// delete it; confirm against the definition.
struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only = true);
#endif