builds but crashes

Concedo 2025-08-17 00:09:03 +08:00
parent 2bf128587d
commit bc04366a65
43 changed files with 12183 additions and 2 deletions

@@ -0,0 +1,911 @@
#include "dia_model.h"
void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
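// GGUF tensor names are expected to take the form "dia.<encoder|decoder>.<component>[.<layer index>.<weight name>]",
// e.g. "dia.encoder.embedding" or "dia.decoder.layers.3.self_q_proj".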
std::vector<std::string> parts = split(name, ".");
TTS_ASSERT(parts.size() >= 3);
if (parts[1] == "encoder") {
assign_to_encoder(parts, tensor, name);
} else if (parts[1] == "decoder"){
assign_to_decoder(parts, tensor, name);
} else {
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
}
}
void dia_model::assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
if (parts[2] == "embedding") {
encoder->embedding = ggml_dup_tensor(ctx, tensor);
set_tensor(encoder->embedding, tensor);
} else if (parts[2] == "norm") {
encoder->norm = ggml_dup_tensor(ctx, tensor);
set_tensor(encoder->norm, tensor);
} else if (parts[2] == "layers") {
TTS_ASSERT(parts.size() >= 4);
int index = std::stoi(parts[3]);
TTS_ASSERT(index < encoder->layers.size());
assign_to_encoder_layer(parts[4], encoder->layers[index], tensor);
} else {
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
}
}
void dia_model::assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
if (parts[2] == "embeddings") {
TTS_ASSERT(parts.size() >= 4);
int index = std::stoi(parts[3]);
TTS_ASSERT(index < decoder->embds.size());
decoder->embds[index] = ggml_dup_tensor(ctx, tensor);
set_tensor(decoder->embds[index], tensor);
} else if (parts[2] == "norm") {
decoder->norm = ggml_dup_tensor(ctx, tensor);
set_tensor(decoder->norm, tensor);
} else if (parts[2] == "heads") {
TTS_ASSERT(parts.size() >= 4);
int index = std::stoi(parts[3]);
TTS_ASSERT(index < decoder->heads.size());
decoder->heads[index] = ggml_dup_tensor(ctx, tensor);
set_tensor(decoder->heads[index], tensor);
} else if (parts[2] == "layers") {
TTS_ASSERT(parts.size() >= 4);
int index = std::stoi(parts[3]);
TTS_ASSERT(index < decoder->layers.size());
assign_to_decoder_layer(parts[4], decoder->layers[index], tensor);
} else {
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
}
}
void dia_model::assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor) {
if (part == "q_proj") {
layer->q = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->q, tensor);
} else if (part == "k_proj") {
layer->k = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->k, tensor);
} else if (part == "v_proj") {
layer->v = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->v, tensor);
} else if (part == "o_proj") {
layer->o = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->o, tensor);
} else if (part == "pre_sa_norm") {
layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_norm, tensor);
} else if (part == "post_sa_norm") {
layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->mlp_norm, tensor);
} else if (part == "gate") {
layer->gate = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->gate, tensor);
} else if (part == "up") {
layer->up = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->up, tensor);
} else if (part == "wo") {
layer->out = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->out, tensor);
} else {
TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
}
}
void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) {
if (part == "self_q_proj") {
layer->self_attn_q = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_q, tensor);
} else if (part == "self_k_proj") {
layer->self_attn_k = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_k, tensor);
} else if (part == "self_v_proj") {
layer->self_attn_v = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_v, tensor);
} else if (part == "self_o_proj") {
layer->self_attn_o = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_o, tensor);
} else if (part == "cross_q_proj") {
layer->cross_attn_q = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_q, tensor);
} else if (part == "cross_k_proj") {
layer->cross_attn_k = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_k, tensor);
} else if (part == "cross_v_proj") {
layer->cross_attn_v = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_v, tensor);
} else if (part == "cross_o_proj") {
layer->cross_attn_o = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_o, tensor);
} else if (part == "pre_sa_norm") {
layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->self_attn_norm, tensor);
} else if (part == "pre_mlp_norm") {
layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->mlp_norm, tensor);
} else if (part == "pre_ca_norm") {
layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_norm, tensor);
} else if (part == "gate") {
layer->gate = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->gate, tensor);
} else if (part == "up") {
layer->up = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->up, tensor);
} else if (part == "wo") {
layer->out = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->out, tensor);
} else {
TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
}
}
void dia_model::prep_layers() {
encoder = new dia_encoder;
decoder = new dia_decoder;
encoder->layers.reserve((size_t) n_encoder_layers);
for (int i = 0; i < (int) n_encoder_layers; i++) {
dia_encoder_layer * l = new dia_encoder_layer;
encoder->layers.push_back(l);
}
decoder->layers.reserve((size_t) n_decoder_layers);
for (int i = 0; i < (int) n_decoder_layers; i++) {
dia_decoder_layer * l = new dia_decoder_layer;
decoder->layers.push_back(l);
}
decoder->embds.reserve((size_t) n_output_heads);
decoder->heads.reserve((size_t) n_output_heads);
for (int i = 0; i < n_output_heads; i++) {
struct ggml_tensor * h = nullptr;
struct ggml_tensor * embd = nullptr;
decoder->embds.push_back(embd);
decoder->heads.push_back(h);
}
}
void dia_model::prep_constants(gguf_context * meta) {
int output_heads_key = gguf_find_key(meta, "dia.decoder.output_heads");
if (output_heads_key != -1) {
n_output_heads = gguf_get_val_u32(meta, output_heads_key);
}
int decoder_layers_key = gguf_find_key(meta, "dia.decoder.layers");
if (decoder_layers_key != -1) {
n_decoder_layers = gguf_get_val_u32(meta, decoder_layers_key);
}
int encoder_layers_key = gguf_find_key(meta, "dia.encoder.layers");
if (encoder_layers_key != -1) {
n_encoder_layers = gguf_get_val_u32(meta, encoder_layers_key);
}
int decoder_hidden_size_key = gguf_find_key(meta, "dia.decoder.hidden_size");
if (decoder_hidden_size_key != -1) {
decoder_hidden_size = gguf_get_val_u32(meta, decoder_hidden_size_key);
}
int decoder_attn_heads_key = gguf_find_key(meta, "dia.decoder.attn_heads");
if (decoder_attn_heads_key != -1) {
decoder_attn_heads = gguf_get_val_u32(meta, decoder_attn_heads_key);
}
int decoder_query_heads_key = gguf_find_key(meta, "dia.decoder.query_heads");
if (decoder_query_heads_key != -1) {
decoder_query_heads = gguf_get_val_u32(meta, decoder_query_heads_key);
}
int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
if (encoder_attn_heads_key != -1) {
encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
}
int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
if (head_size_key != -1) {
head_size = gguf_get_val_u32(meta, head_size_key);
}
int eos_token_id_key = gguf_find_key(meta, "dia.eos_token_id");
if (eos_token_id_key != -1) {
eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
}
int bos_token_id_key = gguf_find_key(meta, "dia.bos_token_id");
if (bos_token_id_key != -1) {
bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
}
int pad_token_id_key = gguf_find_key(meta, "dia.pad_token_id");
if (pad_token_id_key != -1) {
pad_token_id = gguf_get_val_u32(meta, pad_token_id_key);
}
int max_context_key = gguf_find_key(meta, "dia.encoder.max_context_length");
if (max_context_key != -1) {
max_encoder_context_length = gguf_get_val_u32(meta, max_context_key);
}
int output_vocab_size_key = gguf_find_key(meta, "dia.decoder.output_vocab_size");
if (output_vocab_size_key != -1) {
output_vocab_size = gguf_get_val_u32(meta, output_vocab_size_key);
}
int audio_vocab_size_key = gguf_find_key(meta, "dia.decoder.audio_vocab_size");
if (audio_vocab_size_key != -1) {
audio_vocab_size = gguf_get_val_u32(meta, audio_vocab_size_key);
}
int max_generation_size_key = gguf_find_key(meta, "dia.decoder.max_generation_size");
if (max_generation_size_key != -1) {
max_generation_size = gguf_get_val_u32(meta, max_generation_size_key);
}
int max_delay_key = gguf_find_key(meta, "dia.max_delay");
if (max_delay_key != -1) {
max_delay = gguf_get_val_u32(meta, max_delay_key);
}
// please note that this value is not currently set in the gguf encoder as it effectively only exists as a default
// python parameter (rather than an attribute in the model config) for the python Dia model.
int cfg_scale_key = gguf_find_key(meta, "dia.cfg_scale");
if (cfg_scale_key != -1) {
cfg_scale_data[0] = gguf_get_val_f32(meta, cfg_scale_key);
}
}
void dia_context::reset() {
current_position = 0;
prompt_size = 0;
output_tokens.clear();
delay_steps = -1;
}
struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
dia_context * dctx = new dia_context(model, n_threads);
if (!use_cpu) {
#ifdef GGML_USE_METAL
dctx->backend = ggml_backend_metal_init();
#endif
}
dctx->backend_cpu = ggml_backend_cpu_init();
dctx->set_threads();
dctx->build_schedule();
dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
return dctx;
}
static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
ggml_backend_buffer_type_t buft = nullptr;
// this will only really support CPU or Metal for the time being.
if (dctx->backend != nullptr) {
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
#endif
} else {
buft = ggml_backend_cpu_buffer_type();
}
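// four cache tensors are allocated per decoder layer (self-attention k/v and cross-attention k/v), which is what the mem_size below accounts for.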
struct ggml_init_params params = {
/*.mem_size =*/ (4u * model->n_decoder_layers + 1) * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
return false;
}
cache->ctx = ctx;
cache->k_l.reserve(model->n_decoder_layers);
cache->v_l.reserve(model->n_decoder_layers);
cache->cross_k_l.reserve(model->n_decoder_layers);
cache->cross_v_l.reserve(model->n_decoder_layers);
for (int i = 0; i < (int) model->n_decoder_layers; i++) {
struct ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
struct ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
struct ggml_tensor * cross_k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
struct ggml_tensor * cross_v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
ggml_format_name(cross_k, "cache_cross_k_l%d", i);
ggml_format_name(cross_v, "cache_cross_v_l%d", i);
cache->k_l.push_back(k);
cache->v_l.push_back(v);
cache->cross_k_l.push_back(cross_k);
cache->cross_v_l.push_back(cross_v);
}
// allocate tensors and initialize the buffers to avoid NaNs in the padding
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
if (!buf) {
return false;
}
ggml_backend_buffer_clear(buf, 0);
cache->buf = buf;
return true;
}
static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context *dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads) {
struct ggml_tensor * input_embs = nullptr;
dctx->audio_inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_output_heads * 2);
ggml_set_input(dctx->audio_inp_tokens);
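// one token per output head (audio codebook) is provided for both the conditional and unconditional sequences (hence n_output_heads * 2);
// the decoder input embedding is the sum of the per-head embeddings.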
for (int i = 0; i < n_output_heads; i++) {
struct ggml_tensor * view = ggml_view_1d(ctx, dctx->audio_inp_tokens, 2, i * ggml_element_size(dctx->audio_inp_tokens));
view->nb[0] = n_output_heads * ggml_element_size(dctx->audio_inp_tokens);
if (i == 0) {
input_embs = ggml_get_rows(ctx, decoder->embds[i], view);
} else {
input_embs = ggml_add(ctx, ggml_get_rows(ctx, decoder->embds[i], view), input_embs);
}
}
return input_embs;
}
static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight) {
// dia always uses 1e-5 as the default eps
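// note: despite the generic name this is an RMS norm with a learned scale and no bias.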
float eps = 1e-5f;
inputs = ggml_rms_norm(ctx, inputs, eps);
return ggml_mul(ctx, inputs, weight);
}
static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model) {
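// builds the (max_context x max_context) encoder self-attention mask; its values are filled in later by set_inputs so that
// real tokens only attend to real tokens and pad tokens only attend to pad tokens.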
dctx->encode_attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->max_encoder_context_length, (int64_t) model->max_encoder_context_length);
ggml_set_input(dctx->encode_attn_mask);
return dctx->encode_attn_mask;
}
static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur) {
// concatenate the per-head outputs together and then reshape them
struct ggml_tensor * out = nullptr;
for (int i = 0; i < model->n_output_heads; i++) {
if (i == 0) {
out = ggml_mul_mat(ctx, model->decoder->heads[i], cur);
} else {
out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->decoder->heads[i], cur), 2);
}
}
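// `out` now stacks the conditional and unconditional logits; the two views below separate them so that the cfg_scale
// custom op can combine them into the final logits before sampling.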
struct ggml_tensor * cond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], 0));
struct ggml_tensor * uncond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], out->nb[1]));
return ggml_map_custom2(ctx, cond, uncond, &cfg_scale, out->ne[0], &model->cfg_scale_data);
}
static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch) {
dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length*2);
ggml_set_input(dctx->inp_tokens);
dctx->encode_positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length);
ggml_set_input(dctx->encode_positions);
struct ggml_tensor * attn_mask = build_dia_encoder_attn_mask(ctx, dctx, model);
struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
for (auto layer : model->encoder->layers) {
struct ggml_tensor * residual = cur;
cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention
{
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->q, cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->k, cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->v, cur);
// Strangely, Dia follows the NeoX rotary positional embedding (RoPE) convention.
Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
Kcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Kcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
kq = ggml_soft_max_ext(ctx, kq, attn_mask, 1.0f, 0.0f);
struct ggml_tensor * v = ggml_cont_4d(ctx, ggml_transpose(ctx, Vcur), model->max_encoder_context_length, model->head_size, model->encoder_attn_heads, 2);
struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
// It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
// then down project back to the encoder embedding dimension.
cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
cur = ggml_mul_mat(ctx, layer->o, cur);
}
cur = ggml_add(ctx, cur, residual);
struct ggml_tensor * residual_mlp = cur;
cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
// mlp
{
cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
cur = ggml_mul_mat(ctx, layer->out, cur);
}
cur = ggml_add(ctx, cur, residual_mlp);
}
cur = dia_layer_norm(ctx, cur, model->encoder->norm);
return cur;
}
static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct ggml_tensor * a, int repeat) {
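// emulates a repeat_interleave along dim 1 (ggml has no such op): each slice along dim 1 is repeated `repeat` times
// and the slices are concatenated back together in order.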
//return ggml_repeat(ctx, a, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], 4*a->ne[1], a->ne[2], a->ne[3]));
struct ggml_tensor * running = nullptr;
for (int i = 0; i < a->ne[1]; i++) {
int offset = i * a->nb[1];
struct ggml_tensor * t = ggml_cont(ctx, ggml_view_4d(ctx, a, a->ne[0], 1, a->ne[2], a->ne[3], a->nb[1], a->nb[2], a->nb[3], offset));
t = ggml_repeat(ctx, t, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], repeat, a->ne[2], a->ne[3]));
if (i == 0) {
running = t;
} else {
running = ggml_concat(ctx, running, t, 1);
}
}
return running;
}
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
int64_t attn_size = model->head_size * model->decoder_attn_heads;
struct ggml_tensor * k_cache_view =
ggml_view_2d(
ctx, kv->k_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));
k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
// Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
// If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
// from the cache
k = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
k = ggml_cont(ctx, ggml_reshape_2d(ctx, k, attn_size, 2));
ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));
struct ggml_tensor * v_cache_view = nullptr;
v_cache_view = ggml_view_2d(
ctx, kv->v_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));
// Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
// If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
// from the cache
v = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, v, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}
static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
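// the cross-attention keys and values depend only on the encoder output, so they are computed once during the encoder step
// and cached for the remainder of generation (see build_dia_decoder, which only calls this when batch.encoder_step is set).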
dia_decoder_layer * layer = model->decoder->layers[layer_index];
struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
ctx,
encoder_hidden_states,
model->encoder_hidden_size,
dctx->prompt_size,
2,
model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));
struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
struct ggml_tensor * positions_view = ggml_view_1d(ctx, dctx->encode_positions, dctx->prompt_size, 0);
k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads, dctx->prompt_size, 2)), positions_view, model->head_size, 2);
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 1, 3, 2));
struct ggml_tensor * k_cache_view =
ggml_view_4d(
ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
0);
ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));
struct ggml_tensor * v = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, layer->cross_attn_v, encoder_hidden_states)));
v = ggml_cont_4d(ctx, v, model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2);
struct ggml_tensor * v_cache_view =
ggml_view_4d(
ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
0);
ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}
static struct ggml_tensor * build_dia_decoder(
ggml_cgraph * gf,
ggml_context * ctx,
dia_model * model,
dia_context * dctx,
dia_kv_cache * cache,
dia_ubatch & batch,
struct ggml_tensor * encoder_hidden_states) {
dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
ggml_set_input(dctx->positions);
struct ggml_tensor * cur = build_dia_decoder_inp_embd(ctx, dctx, model->decoder, batch, model->n_output_heads);
for (int l = 0; l < model->decoder->layers.size(); l++){
dia_decoder_layer * layer = model->decoder->layers[l];
struct ggml_tensor * residual = cur;
cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention
{
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->self_attn_q, cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->self_attn_k, cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->self_attn_v, cur);
build_dia_self_kv_store(ctx, dctx, model, cache, gf, Kcur, Vcur, batch, l);
struct ggml_tensor * k =
ggml_view_4d(ctx, cache->k_l[l],
model->head_size, model->decoder_attn_heads, dctx->current_position + 1, 2,
ggml_element_size(cache->k_l[l]) * model->head_size,
ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size,
ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
0);
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));
struct ggml_tensor * v =
ggml_view_3d(ctx, cache->v_l[l],
model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
0);
v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);
// As noted in the encoder Dia uses the Neo-X protocol for RoPE.
Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);
// given that attention bias, scaling, and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
kq = ggml_soft_max_ext(ctx, kq, nullptr, 1.0f, 0.0f);
struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
struct ggml_tensor * kqv_merged = ggml_cont(ctx, ggml_permute(ctx, kqv, 2, 0, 1, 3));
cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
cur = ggml_mul_mat(ctx, layer->self_attn_o, cur);
}
// if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
cur = ggml_add(ctx, cur, residual);
struct ggml_tensor * residual_cross = cur;
cur = dia_layer_norm(ctx, cur, layer->cross_attn_norm);
// cross-attention
{
struct ggml_tensor * cross_Qcur = ggml_mul_mat(ctx, layer->cross_attn_q, cur);
// only load the cross attention kv store when performing the encoding step
if (batch.encoder_step) {
build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
}
struct ggml_tensor * cross_k =
ggml_view_4d(
ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
0);
// the double permute operation shouldn't be necessary here, but it seems that ggml_permute currently only allows a single
// axis pair to be transposed.
cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));
struct ggml_tensor * cross_v =
ggml_cont(ctx, ggml_view_4d(
ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
0));
// As noted in the encoder Dia uses the Neo-X protocol for RoPE.
cross_Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, cross_Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
struct ggml_tensor * cross_q = ggml_cont(ctx, ggml_permute(ctx, cross_Qcur, 0, 2, 1, 3));
struct ggml_tensor * cross_kq = ggml_mul_mat(ctx, cross_k, cross_q);
// given that attention bias, scaling, and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
cross_kq = ggml_soft_max_ext(ctx, cross_kq, nullptr, 1.0f, 0.0f);
struct ggml_tensor * cross_kqv = ggml_mul_mat(ctx, cross_kq, cross_v);
struct ggml_tensor * cross_kqv_merged = ggml_cont(ctx, ggml_permute(ctx, cross_kqv, 2, 0, 1, 3));
cur = ggml_cont_3d(ctx, cross_kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
cur = ggml_mul_mat(ctx, layer->cross_attn_o, cur);
}
// if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
cur = ggml_add(ctx, cur, residual_cross);
struct ggml_tensor * residual_mlp = cur;
cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
// mlp
{
cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
cur = ggml_mul_mat(ctx, layer->out, cur);
}
cur = ggml_add(ctx, cur, residual_mlp);
}
cur = dia_layer_norm(ctx, cur, model->decoder->norm);
cur = build_dia_head_outputs(ctx, model, cur);
return cur;
}
void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
// generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
// proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the
// max context size for both the conditional and unconditional sequence.
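// For example, "[S1] Hi." is tokenized to the byte values {0x01, ' ', 'H', 'i', '.'} (after the [S1]/[S2] replacement below)
// and then zero padded out to max_encoder_context_length * 2.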
// if the sentence doesn't start with a dialogue start token, [S1] or [S2], then prepend one.
sentence = strip(sentence);
std::string start = sentence.substr(0, 4);
if (start != "[S1]" && start != "[S2]") {
sentence = "[S1] " + sentence;
}
if (sentence[sentence.size() - 1] != '.') {
sentence += ".";
}
// [S1] and [S2] are special character sequences that are replaced with the special tokens 0x01 and 0x02 respectively.
std::string r1(1, 1);
std::string r2(1, 2);
while (sentence.find("[S1]") != std::string::npos) {
size_t pos = sentence.find("[S1]");
sentence.replace(pos, 4, r1);
}
while (sentence.find("[S2]") != std::string::npos) {
size_t pos = sentence.find("[S2]");
sentence.replace(pos, 4, r2);
}
if (sentence.size() > model->max_encoder_context_length) {
TTS_ABORT("Dia currently only supports a max of %d characters and received an input of %d characters.", model->max_encoder_context_length, sentence.size());
}
batch.tokens.reserve(model->max_encoder_context_length * 2);
for (auto character : sentence) {
batch.tokens.push_back((uint32_t) character);
}
batch.sentence_length = batch.tokens.size();
// this 100 token warning is arbitrarily chosen based on spot checking small prompt performance
if (batch.sentence_length <= 100) {
fprintf(stdout, "Your prompt has fewer than 100 tokens. Please note that Dia's generation with prompts that are fewer than 100 tokens is highly inconsistent.\n");
}
for (int i = (int) batch.tokens.size(); i < model->max_encoder_context_length * 2; i++) {
batch.tokens.push_back(0u);
}
}
dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
// if we are generating a new batch from text tokens then we need to run the encoder step.
struct dia_ubatch batch{ 1, true};
tokenize_sentence(sentence, batch);
batch.audio_tokens.reserve(model->n_output_heads);
for (int i = 0; i < model->n_output_heads; i++) {
batch.audio_tokens.push_back(model->bos_token_id);
}
return batch;
}
/*
* There are two unique features of Dia's model architecture:
* 1. Dia cleans its output generation by adding the difference between its text-based output (its conditional output) and its unconditional output
* to the conditional output before sampling. This is why the batch dimension is set to two throughout the graph.
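* (roughly, final logits = conditional + cfg_scale * (conditional - unconditional); see build_dia_head_outputs and the cfg_scale custom op for the exact combination)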
*
* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
* encoder sequence is always max length.
*/
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
init_build();
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
struct ggml_tensor * encoded_states = nullptr;
if (batch.encoder_step) {
encoded_states = build_dia_encoder(ctx, model, dctx, batch);
ggml_build_forward_expand(gf, encoded_states);
}
struct ggml_tensor * cur = build_dia_decoder(gf, ctx, model, dctx, kv_cross_self, batch, encoded_states);
ggml_set_name(cur, "decoder_output");
ggml_build_forward_expand(gf, cur);
free_build();
return gf;
}
void dia_runner::configure_generation(generation_configuration * config) {
GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
decode_sampler->temperature = config->temperature;
decode_sampler->repetition_penalty = config->repetition_penalty;
decode_sampler->do_sample = config->sample;
decode_sampler->top_k = config->top_k;
decode_sampler->top_p = config->top_p;
dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
}
void dia_runner::set_inputs(dia_ubatch & batch) {
if (batch.encoder_step) {
ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
int32_t * ep = (int32_t*) dctx->encode_positions->data;
float * mask = (float*) dctx->encode_attn_mask->data;
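// build a block mask over the padded encoder context: real positions attend only to real positions and padding
// positions attend only to padding positions.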
for (int i = 0; i < model->max_encoder_context_length; i++) {
ep[i] = (int32_t) i;
for (int ii = 0; ii < model->max_encoder_context_length; ii++) {
if (i < batch.sentence_length) {
mask[i*model->max_encoder_context_length + ii] = ii < batch.sentence_length ? 0.0 : -INFINITY;
} else {
mask[i*model->max_encoder_context_length + ii] = ii >= batch.sentence_length ? 0.0 : -INFINITY;
}
}
}
}
// The audio tokens need to be repeated in the input in order to support cfg-scaling, i.e. we need duplicate inputs for the conditional and unconditional logits.
ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), 0, batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
((int32_t*) dctx->positions->data)[0] = dctx->current_position;
}
int dia_runner::decode(dia_ubatch & batch) {
if (batch.encoder_step) {
dctx->prompt_size = batch.sentence_length;
dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
}
ggml_backend_sched_reset(dctx->sched);
const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
const size_t new_size = logits_size * sizeof(float);
if (!dctx->buf_output || prev_size < new_size) {
if (dctx->buf_output) {
ggml_backend_buffer_free(dctx->buf_output);
dctx->buf_output = nullptr;
dctx->logits = nullptr;
}
dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
}
dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);
ggml_cgraph * gf = build_dia_graph(batch);
// the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
std::string resname = ggml_get_name(res);
ggml_backend_sched_alloc_graph(dctx->sched, gf);
set_inputs(batch);
ggml_backend_sched_graph_compute_async(dctx->sched, gf);
float * logits_out = dctx->logits + dctx->current_position * model->output_vocab_size * model->n_output_heads;
dctx->get_ggml_node_data(res, logits_out, model->output_vocab_size * model->n_output_heads * sizeof(float));
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(dctx->sched);
return 0;
}
dia_ubatch dia_runner::build_worst_case_batch() {
struct dia_ubatch batch{ 1, true };
batch.tokens.resize(model->max_encoder_context_length * 2);
batch.audio_tokens.resize(model->n_output_heads);
return batch;
}
void dia_runner::prepare_post_load() {
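// build and schedule a worst case graph (a full length encoder step) so that the backend scheduler can reserve
// enough compute buffer memory up front.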
dac_runner->prepare_post_load();
dia_kv_cache_init(kv_cross_self, model, dctx);
auto batch = build_worst_case_batch();
batch.sentence_length = model->max_encoder_context_length;
dctx->prompt_size = model->max_encoder_context_length;
auto gf = build_dia_graph(batch);
dctx->prep_schedule(gf);
}
bool dia_runner::check_stopping(dia_ubatch & batch) {
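// generation stops max_delay steps after the first head emits an EOS token (or after the generation limit is reached);
// during those final steps EOS / pad tokens are forced into the remaining heads according to their delay offsets.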
if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
dctx->delay_steps = model->max_delay;
}
if (dctx->delay_steps > 0) {
int step_after_eos = model->max_delay - dctx->delay_steps;
for (int i = 0; i < model->delay_pattern.size(); i++) {
if (step_after_eos == model->delay_pattern[i]) {
batch.audio_tokens[i] = model->eos_token_id;
} else if (step_after_eos > model->delay_pattern[i]) {
batch.audio_tokens[i] = model->pad_token_id;
}
}
dctx->delay_steps -= 1;
}
return dctx->delay_steps == 0;
}
void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
// currently this is applying a sliding window over the heads and filtering out bad tokens.
// If we convert the DAC model's quantizer layers to support row + column embeddings then we will need to transpose
// the heads and the sequence here, but right now simply using a strided view is more performant.
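// concretely, head i is read back with an offset of delay_pattern[i] steps so that the delayed codebooks are realigned,
// and any step containing an out of vocabulary (>= audio_vocab_size) token is dropped entirely.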
size_t size = output_tokens.size();
filtered.reserve(size);
for (int i = 0; i < (size / model->n_output_heads) - model->max_delay; i++) {
bool skip_step = false;
for (int ii = 0; ii < model->n_output_heads; ii++) {
int next_index = i*model->n_output_heads+model->delay_pattern[ii]*model->n_output_heads+ii;
if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) {
skip_step = true;
break;
}
}
if (!skip_step) {
for (int ii = 0; ii < model->n_output_heads; ii++) {
int next_index = i*model->n_output_heads+model->delay_pattern[ii]*model->n_output_heads+ii;
filtered.push_back(output_tokens[next_index]);
}
}
}
}
int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
while (!check_stopping(batch)) {
int state = decode(batch);
if (state != 0) {
return state;
}
decode_sampler->sample(dctx->logits + dctx->current_position * model->n_output_heads * model->output_vocab_size, dctx->output_tokens);
dctx->current_position += batch.sequence_length;
batch = dia_ubatch{ 1 };
uint32_t * last_outputs = (dctx->output_tokens.data() + (int) dctx->output_tokens.size() - model->n_output_heads);
batch.audio_tokens.reserve(model->n_output_heads);
for (int i = 0; i < model->n_output_heads; i++) {
batch.audio_tokens.push_back(dctx->current_position > i ? last_outputs[i] : model->bos_token_id);
}
}
std::vector<uint32_t> filtered_output_tokens;
adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);
dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
return 0;
}
int dia_runner::generate(std::string sentence, struct tts_response * output) {
dia_ubatch batch = batch_from_sentence(sentence);
dctx->reset();
decode_sampler->reset();
dctx->current_position = 0;
if (!kv_cross_self) {
kv_cross_self = new dia_kv_cache;
if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
return 1;
}
}
return generate_from_batch(batch, output);
}
void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
if (tensor->data == NULL) {
return;
}
if (name.size() == 0) {
// handles the top level meta tensor
return;
}
if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
dac_runner->model->assign_weight(name.substr(14), tensor);
} else {
model->assign_weight(name, tensor);
}
}