mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00

commit bc04366a65 (parent 2bf128587d)
commit message: builds but crashes

43 changed files with 12183 additions and 2 deletions
otherarch/ttscpp/src/dia_model.cpp (new file, 911 lines)
@@ -0,0 +1,911 @@
#include "dia_model.h"
|
||||
|
||||
void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
|
||||
std::vector<std::string> parts = split(name, ".");
|
||||
TTS_ASSERT(parts.size() >= 3);
|
||||
|
||||
if (parts[1] == "encoder") {
|
||||
assign_to_encoder(parts, tensor, name);
|
||||
} else if (parts[1] == "decoder"){
|
||||
assign_to_decoder(parts, tensor, name);
|
||||
} else {
|
||||
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
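// Illustrative routing examples (assumed from the parsing logic in this file
// rather than taken from a real GGUF dump): parts[1] picks the encoder or
// decoder, parts[2] the component, parts[3] an index, and parts[4] the layer
// tensor, e.g. a name like "dia.encoder.layers.3.q_proj" would land in
// encoder->layers[3] via assign_to_encoder_layer.
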
void dia_model::assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
    if (parts[2] == "embedding") {
        encoder->embedding = ggml_dup_tensor(ctx, tensor);
        set_tensor(encoder->embedding, tensor);
    } else if (parts[2] == "norm") {
        encoder->norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(encoder->norm, tensor);
    } else if (parts[2] == "layers") {
        // parts[4] is accessed below, so require at least 5 parts
        TTS_ASSERT(parts.size() >= 5);
        int index = std::stoi(parts[3]);
        // bounds check against the encoder's layers (not the decoder's) before indexing into them
        TTS_ASSERT(index < encoder->layers.size());
        assign_to_encoder_layer(parts[4], encoder->layers[index], tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
    }
}

void dia_model::assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
    if (parts[2] == "embeddings") {
        // parts[3] is accessed below, so require at least 4 parts
        TTS_ASSERT(parts.size() > 3);
        int index = std::stoi(parts[3]);
        TTS_ASSERT(index < decoder->embds.size());
        decoder->embds[index] = ggml_dup_tensor(ctx, tensor);
        set_tensor(decoder->embds[index], tensor);
    } else if (parts[2] == "norm") {
        decoder->norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(decoder->norm, tensor);
    } else if (parts[2] == "heads") {
        TTS_ASSERT(parts.size() > 3);
        int index = std::stoi(parts[3]);
        TTS_ASSERT(index < decoder->heads.size());
        decoder->heads[index] = ggml_dup_tensor(ctx, tensor);
        set_tensor(decoder->heads[index], tensor);
    } else if (parts[2] == "layers") {
        // parts[4] is accessed below, so require at least 5 parts
        TTS_ASSERT(parts.size() >= 5);
        int index = std::stoi(parts[3]);
        TTS_ASSERT(index < decoder->layers.size());
        assign_to_decoder_layer(parts[4], decoder->layers[index], tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
    }
}

void dia_model::assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor) {
    if (part == "q_proj") {
        layer->q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->q, tensor);
    } else if (part == "k_proj") {
        layer->k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->k, tensor);
    } else if (part == "v_proj") {
        layer->v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->v, tensor);
    } else if (part == "o_proj") {
        layer->o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->o, tensor);
    } else if (part == "pre_sa_norm") {
        layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_norm, tensor);
    } else if (part == "post_sa_norm") {
        layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->mlp_norm, tensor);
    } else if (part == "gate") {
        layer->gate = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->gate, tensor);
    } else if (part == "up") {
        layer->up = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->up, tensor);
    } else if (part == "wo") {
        layer->out = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->out, tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
    }
}

void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) {
    if (part == "self_q_proj") {
        layer->self_attn_q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_q, tensor);
    } else if (part == "self_k_proj") {
        layer->self_attn_k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_k, tensor);
    } else if (part == "self_v_proj") {
        layer->self_attn_v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_v, tensor);
    } else if (part == "self_o_proj") {
        layer->self_attn_o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_o, tensor);
    } else if (part == "cross_q_proj") {
        layer->cross_attn_q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_q, tensor);
    } else if (part == "cross_k_proj") {
        layer->cross_attn_k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_k, tensor);
    } else if (part == "cross_v_proj") {
        layer->cross_attn_v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_v, tensor);
    } else if (part == "cross_o_proj") {
        layer->cross_attn_o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_o, tensor);
    } else if (part == "pre_sa_norm") {
        layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_norm, tensor);
    } else if (part == "pre_mlp_norm") {
        layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->mlp_norm, tensor);
    } else if (part == "pre_ca_norm") {
        layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_norm, tensor);
    } else if (part == "gate") {
        layer->gate = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->gate, tensor);
    } else if (part == "up") {
        layer->up = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->up, tensor);
    } else if (part == "wo") {
        layer->out = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->out, tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' for decoder layer when loading Dia from GGUF file.", part.c_str());
    }
}

void dia_model::prep_layers() {
    encoder = new dia_encoder;
    decoder = new dia_decoder;
    encoder->layers.reserve((size_t) n_encoder_layers);
    for (int i = 0; i < (int) n_encoder_layers; i++) {
        dia_encoder_layer * l = new dia_encoder_layer;
        encoder->layers.push_back(l);
    }

    decoder->layers.reserve((size_t) n_decoder_layers);
    for (int i = 0; i < (int) n_decoder_layers; i++) {
        dia_decoder_layer * l = new dia_decoder_layer;
        decoder->layers.push_back(l);
    }

    decoder->embds.reserve((size_t) n_output_heads);
    decoder->heads.reserve((size_t) n_output_heads);
    for (int i = 0; i < n_output_heads; i++) {
        struct ggml_tensor * h = nullptr;
        struct ggml_tensor * embd = nullptr;
        decoder->embds.push_back(embd);
        decoder->heads.push_back(h);
    }
}

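// The decoder keeps one embedding table and one output head per audio channel
// (n_output_heads of each). The nullptr placeholders pushed above are replaced
// with real tensors later, when assign_to_decoder loads them from the GGUF file.
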
void dia_model::prep_constants(gguf_context * meta) {
    int output_heads_key = gguf_find_key(meta, "dia.decoder.output_heads");
    if (output_heads_key != -1) {
        n_output_heads = gguf_get_val_u32(meta, output_heads_key);
    }

    int decoder_layers_key = gguf_find_key(meta, "dia.decoder.layers");
    if (decoder_layers_key != -1) {
        n_decoder_layers = gguf_get_val_u32(meta, decoder_layers_key);
    }

    int encoder_layers_key = gguf_find_key(meta, "dia.encoder.layers");
    if (encoder_layers_key != -1) {
        n_encoder_layers = gguf_get_val_u32(meta, encoder_layers_key);
    }

    int decoder_hidden_size_key = gguf_find_key(meta, "dia.decoder.hidden_size");
    if (decoder_hidden_size_key != -1) {
        decoder_hidden_size = gguf_get_val_u32(meta, decoder_hidden_size_key);
    }

    int decoder_attn_heads_key = gguf_find_key(meta, "dia.decoder.attn_heads");
    if (decoder_attn_heads_key != -1) {
        decoder_attn_heads = gguf_get_val_u32(meta, decoder_attn_heads_key);
    }

    int decoder_query_heads_key = gguf_find_key(meta, "dia.decoder.query_heads");
    if (decoder_query_heads_key != -1) {
        decoder_query_heads = gguf_get_val_u32(meta, decoder_query_heads_key);
    }

    int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
    if (encoder_attn_heads_key != -1) {
        encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
    }

    int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
    if (head_size_key != -1) {
        head_size = gguf_get_val_u32(meta, head_size_key);
    }

    int eos_token_id_key = gguf_find_key(meta, "dia.eos_token_id");
    if (eos_token_id_key != -1) {
        eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
    }

    int bos_token_id_key = gguf_find_key(meta, "dia.bos_token_id");
    if (bos_token_id_key != -1) {
        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    }

    int pad_token_id_key = gguf_find_key(meta, "dia.pad_token_id");
    if (pad_token_id_key != -1) {
        pad_token_id = gguf_get_val_u32(meta, pad_token_id_key);
    }

    int max_context_key = gguf_find_key(meta, "dia.encoder.max_context_length");
    if (max_context_key != -1) {
        max_encoder_context_length = gguf_get_val_u32(meta, max_context_key);
    }

    int output_vocab_size_key = gguf_find_key(meta, "dia.decoder.output_vocab_size");
    if (output_vocab_size_key != -1) {
        output_vocab_size = gguf_get_val_u32(meta, output_vocab_size_key);
    }

    int audio_vocab_size_key = gguf_find_key(meta, "dia.decoder.audio_vocab_size");
    if (audio_vocab_size_key != -1) {
        audio_vocab_size = gguf_get_val_u32(meta, audio_vocab_size_key);
    }

    int max_generation_size_key = gguf_find_key(meta, "dia.decoder.max_generation_size");
    if (max_generation_size_key != -1) {
        max_generation_size = gguf_get_val_u32(meta, max_generation_size_key);
    }

    int max_delay_key = gguf_find_key(meta, "dia.max_delay");
    if (max_delay_key != -1) {
        max_delay = gguf_get_val_u32(meta, max_delay_key);
    }

    // please note that this value is not currently set in the gguf encoder as it effectively only exists as a default
    // python parameter (rather than an attribute in the model config) for the python Dia model.
    int cfg_scale_key = gguf_find_key(meta, "dia.cfg_scale");
    if (cfg_scale_key != -1) {
        cfg_scale_data[0] = gguf_get_val_f32(meta, cfg_scale_key);
    }
}

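// The find-then-get pattern above repeats for every key; a small helper along
// these lines (hypothetical, not part of this file or of the GGUF API) would
// collapse each block to a single line:
//
//   static uint32_t gguf_u32_or(gguf_context * meta, const char * key, uint32_t fallback) {
//       int k = gguf_find_key(meta, key);
//       return k != -1 ? gguf_get_val_u32(meta, k) : fallback;
//   }
//
// usage: n_output_heads = gguf_u32_or(meta, "dia.decoder.output_heads", n_output_heads);
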
void dia_context::reset() {
    current_position = 0;
    prompt_size = 0;
    output_tokens.clear();
    delay_steps = -1;
}

struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
    dia_context * dctx = new dia_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        dctx->backend = ggml_backend_metal_init();
#endif
    }
    dctx->backend_cpu = ggml_backend_cpu_init();
    dctx->set_threads();
    dctx->build_schedule();
    dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return dctx;
}

static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
    ggml_backend_buffer_type_t buft = nullptr;
    // this will only really support cpu or metal for the time being
    if (dctx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    struct ggml_init_params params = {
        /*.mem_size   =*/ (4u * model->n_decoder_layers + 1) * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }
    cache->ctx = ctx;

    cache->k_l.reserve(model->n_decoder_layers);
    cache->v_l.reserve(model->n_decoder_layers);
    cache->cross_k_l.reserve(model->n_decoder_layers);
    cache->cross_v_l.reserve(model->n_decoder_layers);

    for (int i = 0; i < (int) model->n_decoder_layers; i++) {
        struct ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * cross_k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        struct ggml_tensor * cross_v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        ggml_format_name(cross_k, "cache_cross_k_l%d", i);
        ggml_format_name(cross_v, "cache_cross_v_l%d", i);
        cache->k_l.push_back(k);
        cache->v_l.push_back(v);
        cache->cross_k_l.push_back(cross_k);
        cache->cross_v_l.push_back(cross_v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
    if (!buf) {
        return false;
    }
    ggml_backend_buffer_clear(buf, 0);
    cache->buf = buf;

    return true;
}

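// Sizing note for the cache tensors above: the self-attention entries hold
// head_size * decoder_attn_heads * max_generation_size * 2 elements, and the
// cross-attention entries use max_encoder_context_length in place of
// max_generation_size. The trailing factor of 2 holds the conditional and
// unconditional sequences side by side for cfg-scaling (see the architecture
// comment above build_dia_graph further down).
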
static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context * dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads) {
    struct ggml_tensor * input_embs = nullptr;

    dctx->audio_inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_output_heads * 2);
    ggml_set_input(dctx->audio_inp_tokens);
    for (int i = 0; i < (int) n_output_heads; i++) {
        struct ggml_tensor * view = ggml_view_1d(ctx, dctx->audio_inp_tokens, 2, i * ggml_element_size(dctx->audio_inp_tokens));
        view->nb[0] = n_output_heads * ggml_element_size(dctx->audio_inp_tokens);
        if (i == 0) {
            input_embs = ggml_get_rows(ctx, decoder->embds[i], view);
        } else {
            input_embs = ggml_add(ctx, ggml_get_rows(ctx, decoder->embds[i], view), input_embs);
        }
    }
    return input_embs;
}

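// How the strided view above works: audio_inp_tokens is laid out as
// [cond ch0 .. chN-1 | uncond ch0 .. chN-1] (see set_inputs below, which
// writes the same audio tokens twice). Overriding nb[0] with a stride of
// n_output_heads elements makes the 2-element view at offset i select the
// conditional and unconditional token for channel i, and the per-channel
// embeddings are then summed into a single input embedding.
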
static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight) {
    // dia always uses 1e-5 as the default eps
    const float eps = 1e-5f;
    inputs = ggml_rms_norm(ctx, inputs, eps);
    return ggml_mul(ctx, inputs, weight);
}

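// Note that despite the "layer_norm" name, the helper above computes an RMS
// norm with a learned scale (ggml_rms_norm followed by an elementwise mul),
// which matches the single weight tensor loaded per norm in assign_weight.
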
static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model) {
    dctx->encode_attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->max_encoder_context_length, (int64_t) model->max_encoder_context_length);
    ggml_set_input(dctx->encode_attn_mask);

    return dctx->encode_attn_mask;
}

static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur) {
    // concatenate the per-channel head outputs together and then reshape them
    struct ggml_tensor * out = nullptr;
    for (int i = 0; i < model->n_output_heads; i++) {
        if (i == 0) {
            out = ggml_mul_mat(ctx, model->decoder->heads[i], cur);
        } else {
            out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->decoder->heads[i], cur), 2);
        }
    }
    struct ggml_tensor * cond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], 0));
    struct ggml_tensor * uncond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], out->nb[1]));
    return ggml_map_custom2(ctx, cond, uncond, &cfg_scale, out->ne[0], &model->cfg_scale_data);
}

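// cfg_scale here is the custom op applied over the conditional and
// unconditional logit views; per the architecture comment above
// build_dia_graph further down, it folds the conditional/unconditional
// difference into the logits before sampling, with cfg_scale_data supplying
// the scale factor.
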
static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch) {
    dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length*2);
    ggml_set_input(dctx->inp_tokens);

    dctx->encode_positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length);
    ggml_set_input(dctx->encode_positions);

    struct ggml_tensor * attn_mask = build_dia_encoder_attn_mask(ctx, dctx, model);

    struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
    for (auto layer : model->encoder->layers) {
        struct ggml_tensor * residual = cur;

        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->v, cur);

            // Somewhat unusually, Dia follows the GPT-NeoX rotary positional embedding protocol (hence rope mode 2 below).
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
            Kcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Kcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            kq = ggml_soft_max_ext(ctx, kq, attn_mask, 1.0f, 0.0f);
            struct ggml_tensor * v = ggml_cont_4d(ctx, ggml_transpose(ctx, Vcur), model->max_encoder_context_length, model->head_size, model->encoder_attn_heads, 2);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);

            // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard.
            // Instead they up-project to the decoder's embedding dimension, then down-project back to the encoder embedding dimension.
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
            cur = ggml_mul_mat(ctx, layer->o, cur);
        }

        cur = ggml_add(ctx, cur, residual);
        struct ggml_tensor * residual_mlp = cur;

        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }

        cur = ggml_add(ctx, cur, residual_mlp);
    }

    cur = dia_layer_norm(ctx, cur, model->encoder->norm);
    return cur;
}

static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct ggml_tensor * a, int repeat) {
    //return ggml_repeat(ctx, a, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], 4*a->ne[1], a->ne[2], a->ne[3]));
    struct ggml_tensor * running = nullptr;
    for (int i = 0; i < a->ne[1]; i++) {
        int offset = i * a->nb[1];
        struct ggml_tensor * t = ggml_cont(ctx, ggml_view_4d(ctx, a, a->ne[0], 1, a->ne[2], a->ne[3], a->nb[1], a->nb[2], a->nb[3], offset));
        t = ggml_repeat(ctx, t, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], repeat, a->ne[2], a->ne[3]));
        if (i == 0) {
            running = t;
        } else {
            running = ggml_concat(ctx, running, t, 1);
        }
    }
    return running;
}

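// Semantics sketch for the helper above: with rows [a, b] along dim 1 and
// repeat = 2, the result is [a, a, b, b]; a plain ggml_repeat would instead
// tile the whole tensor as [a, b, a, b]. The interleaved order is what the
// grouped-query-attention head expansion in build_dia_self_kv_store needs.
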
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
    int64_t attn_size = model->head_size * model->decoder_attn_heads;

    struct ggml_tensor * k_cache_view =
        ggml_view_2d(
            ctx, kv->k_l[layer_index], attn_size, 2,
            attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
            attn_size * dctx->current_position * ggml_element_size(kv->k_l[layer_index]));

    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
    // Since the sequence length should always be 1 here, this is the most pertinent time to repeat the heads for grouped query attention.
    // If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the
    // attention heads after recalling from the cache.
    k = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
    k = ggml_cont(ctx, ggml_reshape_2d(ctx, k, attn_size, 2));

    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    struct ggml_tensor * v_cache_view = nullptr;

    v_cache_view = ggml_view_2d(
        ctx, kv->v_l[layer_index], attn_size, 2,
        attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
        attn_size * dctx->current_position * ggml_element_size(kv->v_l[layer_index]));

    // As with the keys above, repeat the value heads for grouped query attention while the sequence length is 1.
    v = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, v, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}

static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
    dia_decoder_layer * layer = model->decoder->layers[layer_index];
    struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
        ctx,
        encoder_hidden_states,
        model->encoder_hidden_size,
        dctx->prompt_size,
        2,
        model->encoder_hidden_size * ggml_element_size(encoder_hidden_states),
        model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states),
        0));

    struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
    struct ggml_tensor * positions_view = ggml_view_1d(ctx, dctx->encode_positions, dctx->prompt_size, 0);

    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads, dctx->prompt_size, 2)), positions_view, model->head_size, 2);
    k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 1, 3, 2));

    struct ggml_tensor * k_cache_view =
        ggml_view_4d(
            ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
            model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
            model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
            model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
            0);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    struct ggml_tensor * v = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, layer->cross_attn_v, encoder_hidden_states)));
    v = ggml_cont_4d(ctx, v, model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2);

    struct ggml_tensor * v_cache_view =
        ggml_view_4d(
            ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
            model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
            model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
            model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
            0);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}

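// The cross-attention K/V stored above depend only on the encoder output, so
// build_dia_cross_kv_store runs once, during the encoder step, and every
// subsequent decode step reads the cached values unchanged (see the
// batch.encoder_step guard in build_dia_decoder below).
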
static struct ggml_tensor * build_dia_decoder(
        ggml_cgraph * gf,
        ggml_context * ctx,
        dia_model * model,
        dia_context * dctx,
        dia_kv_cache * cache,
        dia_ubatch & batch,
        struct ggml_tensor * encoder_hidden_states) {
    dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
    ggml_set_input(dctx->positions);
    struct ggml_tensor * cur = build_dia_decoder_inp_embd(ctx, dctx, model->decoder, batch, model->n_output_heads);

    for (int l = 0; l < (int) model->decoder->layers.size(); l++) {
        dia_decoder_layer * layer = model->decoder->layers[l];
        struct ggml_tensor * residual = cur;

        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->self_attn_q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->self_attn_k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->self_attn_v, cur);

            build_dia_self_kv_store(ctx, dctx, model, cache, gf, Kcur, Vcur, batch, l);
            struct ggml_tensor * k =
                ggml_view_4d(ctx, cache->k_l[l],
                    model->head_size, model->decoder_attn_heads, dctx->current_position + 1, 2,
                    ggml_element_size(cache->k_l[l]) * model->head_size,
                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size,
                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                    0);
            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));

            struct ggml_tensor * v =
                ggml_view_3d(ctx, cache->v_l[l],
                    model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                    0);
            v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

            // As noted in the encoder, Dia uses the Neo-X protocol for RoPE.
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);

            // given that attention bias, scaling and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
            kq = ggml_soft_max_ext(ctx, kq, nullptr, 1.0f, 0.0f);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_cont(ctx, ggml_permute(ctx, kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->self_attn_o, cur);
        }

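        // Cache window note: the k/v views above span current_position + 1
        // entries, i.e. everything previously written to the self-attention
        // cache plus the entry stored this step, since decoding advances one
        // token at a time.
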
        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual);
        struct ggml_tensor * residual_cross = cur;

        cur = dia_layer_norm(ctx, cur, layer->cross_attn_norm);
        // cross-attention
        {
            struct ggml_tensor * cross_Qcur = ggml_mul_mat(ctx, layer->cross_attn_q, cur);

            // only load the cross attention kv store when performing the encoding step
            if (batch.encoder_step) {
                build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
            }

            struct ggml_tensor * cross_k =
                ggml_view_4d(
                    ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
                    model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
                    model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
                    model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
                    0);
            // the double permute operation shouldn't be necessary here, but it seems that ggml's permute currently only allows a single
            // axis pair to be transposed.
            cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));

            struct ggml_tensor * cross_v =
                ggml_cont(ctx, ggml_view_4d(
                    ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
                    model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                    model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                    model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
                    0));

            // As noted in the encoder, Dia uses the Neo-X protocol for RoPE.
            cross_Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, cross_Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * cross_q = ggml_cont(ctx, ggml_permute(ctx, cross_Qcur, 0, 2, 1, 3));
            struct ggml_tensor * cross_kq = ggml_mul_mat(ctx, cross_k, cross_q);

            // given that attention bias, scaling and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
            cross_kq = ggml_soft_max_ext(ctx, cross_kq, nullptr, 1.0f, 0.0f);
            struct ggml_tensor * cross_kqv = ggml_mul_mat(ctx, cross_kq, cross_v);
            struct ggml_tensor * cross_kqv_merged = ggml_cont(ctx, ggml_permute(ctx, cross_kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, cross_kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->cross_attn_o, cur);
        }

        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual_cross);
        struct ggml_tensor * residual_mlp = cur;

        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }

        cur = ggml_add(ctx, cur, residual_mlp);
    }

    cur = dia_layer_norm(ctx, cur, model->decoder->norm);
    cur = build_dia_head_outputs(ctx, model, cur);
    return cur;
}

void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
    // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
    // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
    // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
    // proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the
    // max context size for both the conditional and unconditional sequence.

    // if the sentence isn't prefixed by a dialogue start token, [S1] or [S2], then prepend one.
    sentence = strip(sentence);
    std::string start = sentence.substr(0, 4);
    if (start != "[S1]" && start != "[S2]") {
        sentence = "[S1] " + sentence;
    }
    if (sentence[sentence.size() - 1] != '.') {
        sentence += ".";
    }

    // [S1] and [S2] are special character sequences that are replaced with the special tokens 0x01 and 0x02 respectively.
    std::string r1(1, 1);
    std::string r2(1, 2);
    while (sentence.find("[S1]") != std::string::npos) {
        size_t pos = sentence.find("[S1]");
        sentence.replace(pos, 4, r1);
    }
    while (sentence.find("[S2]") != std::string::npos) {
        size_t pos = sentence.find("[S2]");
        sentence.replace(pos, 4, r2);
    }

    if (sentence.size() > model->max_encoder_context_length) {
        TTS_ABORT("Dia currently only supports a max of %d characters and received an input of %d characters.", model->max_encoder_context_length, (int) sentence.size());
    }
    batch.tokens.reserve(model->max_encoder_context_length * 2);
    for (auto character : sentence) {
        batch.tokens.push_back((uint32_t) character);
    }
    batch.sentence_length = batch.tokens.size();
    // this 100 token warning is arbitrarily chosen based on spot checking small prompt performance
    if (batch.sentence_length <= 100) {
        fprintf(stdout, "Your prompt has 100 or fewer tokens. Please note that Dia's generation with prompts this short is highly inconsistent.\n");
    }

    for (int i = (int) batch.tokens.size(); i < (int) model->max_encoder_context_length * 2; i++) {
        batch.tokens.push_back(0u);
    }
}

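// Illustrative example of the scheme above (an assumption for plain ASCII
// input, not taken from a real run): "[S1] Hi" becomes "\x01 Hi." after the
// speaker-tag replacement and trailing period, i.e. byte tokens
// {0x01, 0x20, 0x48, 0x69, 0x2E}, which are then zero-padded out to
// max_encoder_context_length * 2 to cover both the conditional sequence and
// the all-pad unconditional sequence.
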
dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
    // if we are generating a new batch from tokens then we need to run the encoder step.
    struct dia_ubatch batch{ 1, true };
    tokenize_sentence(sentence, batch);
    batch.audio_tokens.reserve(model->n_output_heads);
    for (int i = 0; i < model->n_output_heads; i++) {
        batch.audio_tokens.push_back(model->bos_token_id);
    }
    return batch;
}

/*
 * There are two unique features of Dia's model architecture:
 * 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output)
 *    and its unconditional output to the conditional output before sampling. This is why the batch is set to two
 *    throughout the graph.
 *
 * 2. Dia's decoder attends across the entire encoded space including the pad buffer, which receives a unique attention
 *    mask. This is why the encoder sequence is always max length.
 */
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
    struct ggml_tensor * encoded_states = nullptr;

    if (batch.encoder_step) {
        encoded_states = build_dia_encoder(ctx, model, dctx, batch);
        ggml_build_forward_expand(gf, encoded_states);
    }

    struct ggml_tensor * cur = build_dia_decoder(gf, ctx, model, dctx, kv_cross_self, batch, encoded_states);
    ggml_set_name(cur, "decoder_output");
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}

void dia_runner::configure_generation(generation_configuration * config) {
    GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
    decode_sampler->temperature = config->temperature;
    decode_sampler->repetition_penalty = config->repetition_penalty;
    decode_sampler->do_sample = config->sample;
    decode_sampler->top_k = config->top_k;
    decode_sampler->top_p = config->top_p;
    dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
}

void dia_runner::set_inputs(dia_ubatch & batch) {
    if (batch.encoder_step) {
        ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
        int32_t * ep = (int32_t*) dctx->encode_positions->data;
        float * mask = (float*) dctx->encode_attn_mask->data;
        for (int i = 0; i < model->max_encoder_context_length; i++) {
            ep[i] = (int32_t) i;
            for (int ii = 0; ii < model->max_encoder_context_length; ii++) {
                if (i < batch.sentence_length) {
                    mask[i*model->max_encoder_context_length + ii] = ii < batch.sentence_length ? 0.0f : -INFINITY;
                } else {
                    mask[i*model->max_encoder_context_length + ii] = ii >= batch.sentence_length ? 0.0f : -INFINITY;
                }
            }
        }
    }
    // The audio tokens need to be repeated in the input in order to support cfg-scaling, i.e. we need duplicate inputs for conditional and unconditional logits.
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), 0, batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    ((int32_t*) dctx->positions->data)[0] = dctx->current_position;
}

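// Mask shape produced by the loop above, for a prompt of length S in a
// max-length context: real positions (i < S) attend only to real tokens, and
// pad positions (i >= S) attend only to pad tokens, so the pad buffer forms
// its own isolated attention block rather than being masked out entirely.
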
int dia_runner::decode(dia_ubatch & batch) {
    if (batch.encoder_step) {
        dctx->prompt_size = batch.sentence_length;
        dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
    }
    ggml_backend_sched_reset(dctx->sched);

    const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
    const size_t new_size = logits_size * sizeof(float);

    if (!dctx->buf_output || prev_size < new_size) {
        if (dctx->buf_output) {
            ggml_backend_buffer_free(dctx->buf_output);
            dctx->buf_output = nullptr;
            dctx->logits = nullptr;
        }

        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
    }

    dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

    ggml_cgraph * gf = build_dia_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    std::string resname = ggml_get_name(res);
    ggml_backend_sched_alloc_graph(dctx->sched, gf);

    set_inputs(batch);

    ggml_backend_sched_graph_compute_async(dctx->sched, gf);

    float * logits_out = dctx->logits + dctx->current_position * model->output_vocab_size * model->n_output_heads;
    dctx->get_ggml_node_data(res, logits_out, model->output_vocab_size * model->n_output_heads * sizeof(float));

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(dctx->sched);

    return 0;
}

dia_ubatch dia_runner::build_worst_case_batch() {
    struct dia_ubatch batch{ 1, true };
    batch.tokens.resize(model->max_encoder_context_length * 2);
    batch.audio_tokens.resize(model->n_output_heads);
    return batch;
}

void dia_runner::prepare_post_load() {
    dac_runner->prepare_post_load();
    dia_kv_cache_init(kv_cross_self, model, dctx);
    auto batch = build_worst_case_batch();
    batch.sentence_length = model->max_encoder_context_length;
    dctx->prompt_size = model->max_encoder_context_length;
    auto gf = build_dia_graph(batch);
    dctx->prep_schedule(gf);
}

bool dia_runner::check_stopping(dia_ubatch & batch) {
    if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
        dctx->delay_steps = model->max_delay;
    }

    if (dctx->delay_steps > 0) {
        int step_after_eos = model->max_delay - dctx->delay_steps;
        for (int i = 0; i < (int) model->delay_pattern.size(); i++) {
            if (step_after_eos == model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->eos_token_id;
            } else if (step_after_eos > model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->pad_token_id;
            }
        }
        dctx->delay_steps -= 1;
    }
    return dctx->delay_steps == 0;
}

void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
    // currently this is applying a sliding window over the heads and filtering out bad tokens.
    // If we convert the DAC model's quantizer layers to support row + column embeddings then we will need to transpose
    // the heads and the sequence here, but right now simply using a strided view is more performant.
    size_t size = output_tokens.size();
    filtered.reserve(size);
    for (int i = 0; i < (int) (size / model->n_output_heads) - (int) model->max_delay; i++) {
        bool skip_step = false;
        for (int ii = 0; ii < model->n_output_heads; ii++) {
            int next_index = i*model->n_output_heads + model->delay_pattern[ii]*model->n_output_heads + ii;
            // treat an index equal to size as out of bounds as well
            if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) {
                skip_step = true;
                break;
            }
        }
        if (!skip_step) {
            for (int ii = 0; ii < model->n_output_heads; ii++) {
                int next_index = i*model->n_output_heads + model->delay_pattern[ii]*model->n_output_heads + ii;
                filtered.push_back(output_tokens[next_index]);
            }
        }
    }
}

int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
    while (!check_stopping(batch)) {
        int state = decode(batch);
        if (state != 0) {
            return state;
        }
        decode_sampler->sample(dctx->logits + dctx->current_position * model->n_output_heads * model->output_vocab_size, dctx->output_tokens);
        dctx->current_position += batch.sequence_length;
        batch = dia_ubatch{ 1 };
        uint32_t * last_outputs = (dctx->output_tokens.data() + (int) dctx->output_tokens.size() - model->n_output_heads);
        batch.audio_tokens.reserve(model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            batch.audio_tokens.push_back(dctx->current_position > i ? last_outputs[i] : model->bos_token_id);
        }
    }

    std::vector<uint32_t> filtered_output_tokens;
    adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);

    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
    return 0;
}

int dia_runner::generate(std::string sentence, struct tts_response * output) {
    dia_ubatch batch = batch_from_sentence(sentence);
    dctx->reset();
    decode_sampler->reset();
    dctx->current_position = 0;
    if (!kv_cross_self) {
        kv_cross_self = new dia_kv_cache;
        if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
            return 1;
        }
    }
    return generate_from_batch(batch, output);
}

void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
    if (tensor->data == NULL) {
        return;
    }

    if (name.size() == 0) {
        // handles the top level meta tensor
        return;
    }

    if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
        dac_runner->model->assign_weight(name.substr(14), tensor);
    } else {
        model->assign_weight(name, tensor);
    }
}