#include "dia_model.h" void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) { std::vector parts = split(name, "."); // GGML_LOG("\nassign_weight %s\n",name.c_str()); if(name=="GGUF tensor data binary blob") //patch for gguf, idk why this tensor exists { return; } TTS_ASSERT(parts.size() >= 3); if (parts[1] == "encoder") { assign_to_encoder(parts, tensor, name); } else if (parts[1] == "decoder"){ assign_to_decoder(parts, tensor, name); } else { TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str()); } } void dia_model::assign_to_encoder(std::vector parts, struct ggml_tensor * tensor, std::string name) { if (parts[2] == "embedding") { encoder->embedding = ggml_dup_tensor(ctx, tensor); set_tensor(encoder->embedding, tensor); } else if (parts[2] == "norm") { encoder->norm = ggml_dup_tensor(ctx, tensor); set_tensor(encoder->norm, tensor); } else if (parts[2] == "layers") { TTS_ASSERT(parts.size() >= 4); int index = std::stoi(parts[3]); TTS_ASSERT(index < decoder->layers.size()); assign_to_encoder_layer(parts[4], encoder->layers[index], tensor); } else { TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str()); } } void dia_model::assign_to_decoder(std::vector parts, struct ggml_tensor * tensor, std::string name) { if (parts[2] == "embeddings") { TTS_ASSERT(parts.size() > 2); int index = std::stoi(parts[3]); TTS_ASSERT(index < decoder->embds.size()); decoder->embds[index] = ggml_dup_tensor(ctx, tensor); set_tensor(decoder->embds[index], tensor); } else if (parts[2] == "norm") { decoder->norm = ggml_dup_tensor(ctx, tensor); set_tensor(decoder->norm, tensor); } else if (parts[2] == "heads") { TTS_ASSERT(parts.size() > 2); int index = std::stoi(parts[3]); TTS_ASSERT(index < decoder->heads.size()); decoder->heads[index] = ggml_dup_tensor(ctx, tensor); set_tensor(decoder->heads[index], tensor); } else if (parts[2] == "layers") { TTS_ASSERT(parts.size() >= 4); int index = std::stoi(parts[3]); TTS_ASSERT(index < decoder->layers.size()); assign_to_decoder_layer(parts[4], decoder->layers[index], tensor); } else { TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str()); } } void dia_model::assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor) { if (part == "q_proj") { layer->q = ggml_dup_tensor(ctx, tensor); set_tensor(layer->q, tensor); } else if (part == "k_proj") { layer->k = ggml_dup_tensor(ctx, tensor); set_tensor(layer->k, tensor); } else if (part == "v_proj") { layer->v = ggml_dup_tensor(ctx, tensor); set_tensor(layer->v, tensor); } else if (part == "o_proj") { layer->o = ggml_dup_tensor(ctx, tensor); set_tensor(layer->o, tensor); } else if (part == "pre_sa_norm") { layer->self_attn_norm = ggml_dup_tensor(ctx, tensor); set_tensor(layer->self_attn_norm, tensor); } else if (part == "post_sa_norm") { layer->mlp_norm = ggml_dup_tensor(ctx, tensor); set_tensor(layer->mlp_norm, tensor); } else if (part == "gate") { layer->gate = ggml_dup_tensor(ctx, tensor); set_tensor(layer->gate, tensor); } else if (part == "up") { layer->up = ggml_dup_tensor(ctx, tensor); set_tensor(layer->up, tensor); } else if (part == "wo") { layer->out = ggml_dup_tensor(ctx, tensor); set_tensor(layer->out, tensor); } else { TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str()); } } void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) { if (part == "self_q_proj") { 
void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) {
    if (part == "self_q_proj") {
        layer->self_attn_q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_q, tensor);
    } else if (part == "self_k_proj") {
        layer->self_attn_k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_k, tensor);
    } else if (part == "self_v_proj") {
        layer->self_attn_v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_v, tensor);
    } else if (part == "self_o_proj") {
        layer->self_attn_o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_o, tensor);
    } else if (part == "cross_q_proj") {
        layer->cross_attn_q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_q, tensor);
    } else if (part == "cross_k_proj") {
        layer->cross_attn_k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_k, tensor);
    } else if (part == "cross_v_proj") {
        layer->cross_attn_v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_v, tensor);
    } else if (part == "cross_o_proj") {
        layer->cross_attn_o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_o, tensor);
    } else if (part == "pre_sa_norm") {
        layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->self_attn_norm, tensor);
    } else if (part == "pre_mlp_norm") {
        layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->mlp_norm, tensor);
    } else if (part == "pre_ca_norm") {
        layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->cross_attn_norm, tensor);
    } else if (part == "gate") {
        layer->gate = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->gate, tensor);
    } else if (part == "up") {
        layer->up = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->up, tensor);
    } else if (part == "wo") {
        layer->out = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer->out, tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' for decoder layer when loading Dia from GGUF file.", part.c_str());
    }
}

void dia_model::prep_layers() {
    encoder = new dia_encoder;
    decoder = new dia_decoder;
    encoder->layers.reserve((size_t) n_encoder_layers);
    for (int i = 0; i < (int) n_encoder_layers; i++) {
        dia_encoder_layer * l = new dia_encoder_layer;
        encoder->layers.push_back(l);
    }
    decoder->layers.reserve((size_t) n_decoder_layers);
    for (int i = 0; i < (int) n_decoder_layers; i++) {
        dia_decoder_layer * l = new dia_decoder_layer;
        decoder->layers.push_back(l);
    }
    decoder->embds.reserve((size_t) n_output_heads);
    decoder->heads.reserve((size_t) n_output_heads);
    for (int i = 0; i < n_output_heads; i++) {
        struct ggml_tensor * h = nullptr;
        struct ggml_tensor * embd = nullptr;
        decoder->embds.push_back(embd);
        decoder->heads.push_back(h);
    }
}
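// prep_constants below reads optional overrides from the GGUF metadata; every
// lookup follows the same find-then-read pattern. A helper along these lines
// (hypothetical, not part of this file) would condense the repetition:
//   auto get_u32 = [&](const char * key, uint32_t & dst) {
//       int k = gguf_find_key(meta, key);
//       if (k != -1) dst = gguf_get_val_u32(meta, k);
//   };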
void dia_model::prep_constants(gguf_context * meta) {
    int output_heads_key = gguf_find_key(meta, "dia.decoder.output_heads");
    if (output_heads_key != -1) {
        n_output_heads = gguf_get_val_u32(meta, output_heads_key);
    }
    int decoder_layers_key = gguf_find_key(meta, "dia.decoder.layers");
    if (decoder_layers_key != -1) {
        n_decoder_layers = gguf_get_val_u32(meta, decoder_layers_key);
    }
    int encoder_layers_key = gguf_find_key(meta, "dia.encoder.layers");
    if (encoder_layers_key != -1) {
        n_encoder_layers = gguf_get_val_u32(meta, encoder_layers_key);
    }
    int decoder_hidden_size_key = gguf_find_key(meta, "dia.decoder.hidden_size");
    if (decoder_hidden_size_key != -1) {
        decoder_hidden_size = gguf_get_val_u32(meta, decoder_hidden_size_key);
    }
    int decoder_attn_heads_key = gguf_find_key(meta, "dia.decoder.attn_heads");
    if (decoder_attn_heads_key != -1) {
        decoder_attn_heads = gguf_get_val_u32(meta, decoder_attn_heads_key);
    }
    int decoder_query_heads_key = gguf_find_key(meta, "dia.decoder.query_heads");
    if (decoder_query_heads_key != -1) {
        decoder_query_heads = gguf_get_val_u32(meta, decoder_query_heads_key);
    }
    int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
    if (encoder_attn_heads_key != -1) {
        encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
    }
    int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
    if (head_size_key != -1) {
        head_size = gguf_get_val_u32(meta, head_size_key);
    }
    int eos_token_id_key = gguf_find_key(meta, "dia.eos_token_id");
    if (eos_token_id_key != -1) {
        eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
    }
    int bos_token_id_key = gguf_find_key(meta, "dia.bos_token_id");
    if (bos_token_id_key != -1) {
        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    }
    int pad_token_id_key = gguf_find_key(meta, "dia.pad_token_id");
    if (pad_token_id_key != -1) {
        pad_token_id = gguf_get_val_u32(meta, pad_token_id_key);
    }
    int max_context_key = gguf_find_key(meta, "dia.encoder.max_context_length");
    if (max_context_key != -1) {
        max_encoder_context_length = gguf_get_val_u32(meta, max_context_key);
    }
    int output_vocab_size_key = gguf_find_key(meta, "dia.decoder.output_vocab_size");
    if (output_vocab_size_key != -1) {
        output_vocab_size = gguf_get_val_u32(meta, output_vocab_size_key);
    }
    int audio_vocab_size_key = gguf_find_key(meta, "dia.decoder.audio_vocab_size");
    if (audio_vocab_size_key != -1) {
        audio_vocab_size = gguf_get_val_u32(meta, audio_vocab_size_key);
    }
    int max_generation_size_key = gguf_find_key(meta, "dia.decoder.max_generation_size");
    if (max_generation_size_key != -1) {
        max_generation_size = gguf_get_val_u32(meta, max_generation_size_key);
    }
    int max_delay_key = gguf_find_key(meta, "dia.max_delay");
    if (max_delay_key != -1) {
        max_delay = gguf_get_val_u32(meta, max_delay_key);
    }
    // Please note that this value is not currently set by the GGUF encoder, as it effectively only exists as a default
    // python parameter (rather than an attribute in the model config) for the python Dia model.
    int cfg_scale_key = gguf_find_key(meta, "dia.cfg_scale");
    if (cfg_scale_key != -1) {
        cfg_scale_data[0] = gguf_get_val_f32(meta, cfg_scale_key);
    }
}

void dia_context::reset() {
    current_position = 0;
    prompt_size = 0;
    output_tokens.clear();
    delay_steps = -1;
}

struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
    dia_context * dctx = new dia_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        dctx->backend = ggml_backend_metal_init();
#endif
    }
    dctx->backend_cpu = ggml_backend_cpu_init();
    dctx->set_threads();
    dctx->build_schedule();
    dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return dctx;
}
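// Note on the cache sizing in dia_kv_cache_init below: every cache tensor is
// allocated flat, as head_size * attn_heads * max_len elements times 2; the
// trailing factor of 2 holds the conditional and unconditional sequences that
// run side by side for cfg scaling.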
static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
    ggml_backend_buffer_type_t buft = nullptr;
    // this will only really support cpu or metal for the time being
    if (dctx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    struct ggml_init_params params = {
        /*.mem_size   =*/ (4u * model->n_decoder_layers + 1) * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }
    cache->ctx = ctx;

    cache->k_l.reserve(model->n_decoder_layers);
    cache->v_l.reserve(model->n_decoder_layers);
    cache->cross_k_l.reserve(model->n_decoder_layers);
    cache->cross_v_l.reserve(model->n_decoder_layers);

    for (int i = 0; i < (int) model->n_decoder_layers; i++) {
        struct ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * cross_k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        struct ggml_tensor * cross_v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        ggml_format_name(cross_k, "cache_cross_k_l%d", i);
        ggml_format_name(cross_v, "cache_cross_v_l%d", i);
        cache->k_l.push_back(k);
        cache->v_l.push_back(v);
        cache->cross_k_l.push_back(cross_k);
        cache->cross_v_l.push_back(cross_v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
    if (!buf) {
        return false;
    }
    ggml_backend_buffer_clear(buf, 0);
    cache->buf = buf;
    return true;
}

static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context * dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads) {
    struct ggml_tensor * input_embs;
    dctx->audio_inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_output_heads * 2);
    ggml_set_input(dctx->audio_inp_tokens);
    for (int i = 0; i < n_output_heads; i++) {
        struct ggml_tensor * view = ggml_view_1d(ctx, dctx->audio_inp_tokens, 2, i * ggml_element_size(dctx->audio_inp_tokens));
        view->nb[0] = n_output_heads * ggml_element_size(dctx->audio_inp_tokens);
        if (i == 0) {
            input_embs = ggml_get_rows(ctx, decoder->embds[i], view);
        } else {
            input_embs = ggml_add(ctx, ggml_get_rows(ctx, decoder->embds[i], view), input_embs);
        }
    }
    return input_embs;
}

static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight) {
    // dia always uses 1e-5 as the default eps
    float eps = 0.00001;
    inputs = ggml_rms_norm(ctx, inputs, eps);
    return ggml_mul(ctx, inputs, weight);
}

static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model) {
    dctx->encode_attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->max_encoder_context_length, (int64_t) model->max_encoder_context_length);
    ggml_set_input(dctx->encode_attn_mask);
    return dctx->encode_attn_mask;
}
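// build_dia_head_outputs below concatenates the per-head logits and then
// merges the conditional and unconditional batches with the cfg_scale custom
// op, which (per the architecture note further down) adds the scaled
// difference between the conditional and unconditional logits back in before
// sampling. The op itself is defined elsewhere in the codebase.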
static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur) {
    // going to cat the heads together and then reshape them
    struct ggml_tensor * out;
    for (int i = 0; i < model->n_output_heads; i++) {
        if (i == 0) {
            out = ggml_mul_mat(ctx, model->decoder->heads[i], cur);
        } else {
            out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->decoder->heads[i], cur), 2);
        }
    }
    struct ggml_tensor * cond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], 0));
    struct ggml_tensor * uncond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], out->nb[1]));
    return ggml_map_custom2(ctx, cond, uncond, &cfg_scale, out->ne[0], &model->cfg_scale_data);
}

static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch) {
    dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length*2);
    ggml_set_input(dctx->inp_tokens);
    dctx->encode_positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length);
    ggml_set_input(dctx->encode_positions);
    struct ggml_tensor * attn_mask = build_dia_encoder_attn_mask(ctx, dctx, model);
    struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
    for (auto layer : model->encoder->layers) {
        struct ggml_tensor * residual = cur;
        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->v, cur);

            // Strangely, Dia follows the NeoX rotary positional embedding protocol.
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
            Kcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Kcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);

            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            kq = ggml_soft_max_ext(ctx, kq, attn_mask, 1.0f, 0.0f);

            struct ggml_tensor * v = ggml_cont_4d(ctx, ggml_transpose(ctx, Vcur), model->max_encoder_context_length, model->head_size, model->encoder_attn_heads, 2);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard.
            // Instead they up-project to the decoder's embedding dimension and then down-project back to the encoder embedding dimension.
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
            cur = ggml_mul_mat(ctx, layer->o, cur);
        }
        cur = ggml_add(ctx, cur, residual);

        struct ggml_tensor * residual_mlp = cur;
        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }
        cur = ggml_add(ctx, cur, residual_mlp);
    }
    cur = dia_layer_norm(ctx, cur, model->encoder->norm);
    return cur;
}

static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct ggml_tensor * a, int repeat) {
    //return ggml_repeat(ctx, a, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], 4*a->ne[1], a->ne[2], a->ne[3]));
    struct ggml_tensor * running;
    for (int i = 0; i < a->ne[1]; i++) {
        int offset = i * a->nb[1];
        struct ggml_tensor * t = ggml_cont(ctx, ggml_view_4d(ctx, a, a->ne[0], 1, a->ne[2], a->ne[3], a->nb[1], a->nb[2], a->nb[3], offset));
        t = ggml_repeat(ctx, t, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], repeat, a->ne[2], a->ne[3]));
        if (i == 0) {
            running = t;
        } else {
            running = ggml_concat(ctx, running, t, 1);
        }
    }
    return running;
}
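// repeat_interleave_dim1 above behaves like torch.repeat_interleave along
// dim 1: with repeat = 2, the rows [a, b] become [a, a, b, b]. It is used
// below to expand the grouped KV heads to the full query-head count for
// grouped query attention.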
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
    int64_t attn_size = model->head_size * model->decoder_attn_heads;
    struct ggml_tensor * k_cache_view = ggml_view_2d(
        ctx, kv->k_l[layer_index], attn_size, 2,
        attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
        attn_size * dctx->current_position * ggml_element_size(kv->k_l[layer_index]));

    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);

    // Since the sequence length should always be 1 here, this is the most pertinent time to repeat the heads for grouped query attention.
    // If GGML supported a repeat_interleave op, it would be more optimal to store just the groups in the cache and interleave the
    // attention heads after recalling from the cache.
    k = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
    k = ggml_cont(ctx, ggml_reshape_2d(ctx, k, attn_size, 2));
    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    struct ggml_tensor * v_cache_view = nullptr;
    v_cache_view = ggml_view_2d(
        ctx, kv->v_l[layer_index], attn_size, 2,
        attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
        attn_size * dctx->current_position * ggml_element_size(kv->v_l[layer_index]));

    // Since the sequence length should always be 1 here, this is the most pertinent time to repeat the heads for grouped query attention.
    // If GGML supported a repeat_interleave op, it would be more optimal to store just the groups in the cache and interleave the
    // attention heads after recalling from the cache.
    v = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, v, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}

static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
    dia_decoder_layer * layer = model->decoder->layers[layer_index];
    struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
        ctx, encoder_hidden_states, model->encoder_hidden_size, dctx->prompt_size, 2,
        model->encoder_hidden_size * ggml_element_size(encoder_hidden_states),
        model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states),
        0));
    struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
    struct ggml_tensor * positions_view = ggml_view_1d(ctx, dctx->encode_positions, dctx->prompt_size, 0);
    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads, dctx->prompt_size, 2)), positions_view, model->head_size, 2);
    k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 1, 3, 2));
    struct ggml_tensor * k_cache_view = ggml_view_4d(
        ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
        model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
        model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
        model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
        0);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    struct ggml_tensor * v = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, layer->cross_attn_v, encoder_hidden_states)));
    v = ggml_cont_4d(ctx, v, model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2);
    struct ggml_tensor * v_cache_view = ggml_view_4d(
        ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
        model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
        model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
        model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
        0);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}
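// build_dia_decoder below runs a single autoregressive step: sequence_length
// is 1 during generation, the self-attention KV store above is indexed by
// dctx->current_position, and the cross-attention store is only written on
// the initial encoder step.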
static struct ggml_tensor * build_dia_decoder(
        ggml_cgraph * gf,
        ggml_context * ctx,
        dia_model * model,
        dia_context * dctx,
        dia_kv_cache * cache,
        dia_ubatch & batch,
        struct ggml_tensor * encoder_hidden_states) {
    dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
    ggml_set_input(dctx->positions);
    struct ggml_tensor * cur = build_dia_decoder_inp_embd(ctx, dctx, model->decoder, batch, model->n_output_heads);
    for (int l = 0; l < model->decoder->layers.size(); l++) {
        dia_decoder_layer * layer = model->decoder->layers[l];
        struct ggml_tensor * residual = cur;
        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->self_attn_q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->self_attn_k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->self_attn_v, cur);

            build_dia_self_kv_store(ctx, dctx, model, cache, gf, Kcur, Vcur, batch, l);

            struct ggml_tensor * k = ggml_view_4d(ctx, cache->k_l[l], model->head_size, model->decoder_attn_heads, dctx->current_position + 1, 2,
                ggml_element_size(cache->k_l[l]) * model->head_size,
                ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size,
                ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                0);
            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));
            struct ggml_tensor * v = ggml_view_3d(ctx, cache->v_l[l], model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
                ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
                ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                0);
            v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

            // As noted in the encoder, Dia uses the NeoX protocol for RoPE.
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);
            // Given that attention bias, scaling, and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
            kq = ggml_soft_max_ext(ctx, kq, nullptr, 1.0f, 0.0f);

            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_cont(ctx, ggml_permute(ctx, kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->self_attn_o, cur);
        }
        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual);

        struct ggml_tensor * residual_cross = cur;
        cur = dia_layer_norm(ctx, cur, layer->cross_attn_norm);
        // cross-attention
        {
            struct ggml_tensor * cross_Qcur = ggml_mul_mat(ctx, layer->cross_attn_q, cur);

            // only load the cross attention kv store when performing the encoding step
            if (batch.encoder_step) {
                build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
            }

            struct ggml_tensor * cross_k = ggml_view_4d(
                ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2, model->max_encoder_context_length,
                model->head_size*ggml_element_size(cache->cross_k_l[l]),
                model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
                model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
                0);
            // The double permute shouldn't be necessary here, but ggml_permute currently only allows a single axis pair to be transposed.
            cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));
            struct ggml_tensor * cross_v = ggml_cont(ctx, ggml_view_4d(
                ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
                model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
                0));

            // As noted in the encoder, Dia uses the NeoX protocol for RoPE.
            cross_Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, cross_Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * cross_q = ggml_cont(ctx, ggml_permute(ctx, cross_Qcur, 0, 2, 1, 3));
            struct ggml_tensor * cross_kq = ggml_mul_mat(ctx, cross_k, cross_q);
            // Given that attention bias, scaling, and masking are not used for decoding, it might be faster to use the plain ggml_soft_max op here.
            cross_kq = ggml_soft_max_ext(ctx, cross_kq, nullptr, 1.0f, 0.0f);

            struct ggml_tensor * cross_kqv = ggml_mul_mat(ctx, cross_kq, cross_v);
            struct ggml_tensor * cross_kqv_merged = ggml_cont(ctx, ggml_permute(ctx, cross_kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, cross_kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->cross_attn_o, cur);
        }
        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual_cross);

        struct ggml_tensor * residual_mlp = cur;
        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }
        cur = ggml_add(ctx, cur, residual_mlp);
    }
    cur = dia_layer_norm(ctx, cur, model->decoder->norm);
    cur = build_dia_head_outputs(ctx, model, cur);
    return cur;
}
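// Illustrative example of the tokenization below: "[S1] Hi." becomes the byte
// sequence { 0x01, ' ', 'H', 'i', '.' }, zero-padded out to
// max_encoder_context_length for the conditional half, followed by an
// all-zero unconditional half of the same length.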
void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
    // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
    // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
    // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
    // proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the
    // max context size for both the conditional and unconditional sequences.

    // if the sentence isn't prefixed by a dialogue start token, [S1] or [S2], then prepend one.
    sentence = strip(sentence);
    std::string start = sentence.substr(0, 4);
    if (start != "[S1]" && start != "[S2]") {
        sentence = "[S1] " + sentence;
    }
    if (sentence[sentence.size() - 1] != '.') {
        sentence += ".";
    }

    // [S1] and [S2] are special character sequences that are replaced with the special tokens 0x01 and 0x02 respectively.
    std::string r1(1, 1);
    std::string r2(1, 2);
    while (sentence.find("[S1]") != std::string::npos) {
        size_t pos = sentence.find("[S1]");
        sentence.replace(pos, 4, r1);
    }
    while (sentence.find("[S2]") != std::string::npos) {
        size_t pos = sentence.find("[S2]");
        sentence.replace(pos, 4, r2);
    }
    if (sentence.size() > model->max_encoder_context_length) {
        TTS_ABORT("Dia currently only supports a max of %d characters and received an input of %d characters.", model->max_encoder_context_length, sentence.size());
    }
    batch.tokens.reserve(model->max_encoder_context_length * 2);
    for (auto character : sentence) {
        batch.tokens.push_back((uint32_t) character);
    }
    batch.sentence_length = batch.tokens.size();
    // this 100 token warning is arbitrarily chosen based on spot checking small prompt performance
    if (batch.sentence_length <= 100) {
        fprintf(stdout, "Your prompt has fewer than 100 tokens. Please note that Dia's generation with prompts that are fewer than 100 tokens is highly inconsistent.\n");
    }
    for (int i = (int) batch.tokens.size(); i < model->max_encoder_context_length * 2; i++) {
        batch.tokens.push_back(0u);
    }
}

dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
    // if we are generating a new batch from tokens then we need to run the encoder step
    struct dia_ubatch batch{ 1, true };
    tokenize_sentence(sentence, batch);
    batch.audio_tokens.reserve(model->n_output_heads);
    for (int i = 0; i < model->n_output_heads; i++) {
        batch.audio_tokens.push_back(model->bos_token_id);
    }
    return batch;
}

/*
 * There are two unique features of Dia's model architecture:
 *   1. Dia cleans its output generation by adding the difference between its text based output (its conditional output)
 *      and its unconditional output to the conditional output before sampling. This is why the batch is set to two
 *      throughout the graph.
 *   2. Dia's decoder attends across the entire encoded space including the pad buffer, which receives a unique attention
 *      mask. This is why the encoder sequence is always max length.
 */
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * encoded_states = nullptr;
    if (batch.encoder_step) {
        encoded_states = build_dia_encoder(ctx, model, dctx, batch);
        ggml_build_forward_expand(gf, encoded_states);
    }

    struct ggml_tensor * cur = build_dia_decoder(gf, ctx, model, dctx, kv_cross_self, batch, encoded_states);
    ggml_set_name(cur, "decoder_output");
    ggml_build_forward_expand(gf, cur);
    free_build();
    return gf;
}
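// configure_generation below requires max_tokens to either be 0 (use the
// model default) or exceed max_delay, since the delayed output heads need
// max_delay extra steps to flush their EOS/pad tokens (see check_stopping).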
void dia_runner::configure_generation(generation_configuration * config) {
    GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
    decode_sampler->temperature = config->temperature;
    decode_sampler->repetition_penalty = config->repetition_penalty;
    decode_sampler->do_sample = config->sample;
    decode_sampler->top_k = config->top_k;
    decode_sampler->top_p = config->top_p;
    dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
}

void dia_runner::set_inputs(dia_ubatch & batch) {
    if (batch.encoder_step) {
        ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
        int32_t * ep = (int32_t*) dctx->encode_positions->data;
        float * mask = (float*) dctx->encode_attn_mask->data;
        for (int i = 0; i < model->max_encoder_context_length; i++) {
            ep[i] = (int32_t) i;
            for (int ii = 0; ii < model->max_encoder_context_length; ii++) {
                if (i < batch.sentence_length) {
                    mask[i*model->max_encoder_context_length + ii] = ii < batch.sentence_length ? 0.0f : -INFINITY;
                } else {
                    mask[i*model->max_encoder_context_length + ii] = ii >= batch.sentence_length ? 0.0f : -INFINITY;
                }
            }
        }
    }
    // The audio tokens need to be repeated in the input in order to support cfg-scaling, i.e. we need duplicate inputs for
    // the conditional and unconditional logits.
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), 0, batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    ((int32_t*) dctx->positions->data)[0] = dctx->current_position;
}
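// decode below lazily (re)allocates the output buffer and writes each step's
// cfg-merged logits at offset current_position * output_vocab_size *
// n_output_heads, i.e. the logits are laid out as [vocab, heads] per step.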
int dia_runner::decode(dia_ubatch & batch) {
    if (batch.encoder_step) {
        dctx->prompt_size = batch.sentence_length;
        dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
    }
    ggml_backend_sched_reset(dctx->sched);

    const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
    const size_t new_size = logits_size * sizeof(float);

    if (!dctx->buf_output || prev_size < new_size) {
        if (dctx->buf_output) {
            ggml_backend_buffer_free(dctx->buf_output);
            dctx->buf_output = nullptr;
            dctx->logits = nullptr;
        }
        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
    }

    dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

    ggml_cgraph * gf = build_dia_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    std::string resname = ggml_get_name(res);

    ggml_backend_sched_alloc_graph(dctx->sched, gf);
    set_inputs(batch);
    ggml_backend_sched_graph_compute_async(dctx->sched, gf);

    float * logits_out = dctx->logits + dctx->current_position * model->output_vocab_size * model->n_output_heads;
    dctx->get_ggml_node_data(res, logits_out, model->output_vocab_size * model->n_output_heads * sizeof(float));

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(dctx->sched);
    return 0;
}

dia_ubatch dia_runner::build_worst_case_batch() {
    struct dia_ubatch batch{ 1, true };
    batch.tokens.resize(model->max_encoder_context_length * 2);
    batch.audio_tokens.resize(model->n_output_heads);
    return batch;
}

void dia_runner::prepare_post_load() {
    dac_runner->prepare_post_load();
    dia_kv_cache_init(kv_cross_self, model, dctx);
    auto batch = build_worst_case_batch();
    batch.sentence_length = model->max_encoder_context_length;
    dctx->prompt_size = model->max_encoder_context_length;
    auto gf = build_dia_graph(batch);
    dctx->prep_schedule(gf);
}

bool dia_runner::check_stopping(dia_ubatch & batch) {
    if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
        dctx->delay_steps = model->max_delay;
    }

    if (dctx->delay_steps > 0) {
        int step_after_eos = model->max_delay - dctx->delay_steps;
        for (int i = 0; i < model->delay_pattern.size(); i++) {
            if (step_after_eos == model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->eos_token_id;
            } else if (step_after_eos > model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->pad_token_id;
            }
        }
        dctx->delay_steps -= 1;
    }
    return dctx->delay_steps == 0;
}

void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
    // Currently this applies a sliding window over the heads and filters out bad tokens.
    // If we convert the DAC model's quantizer layers to support by-row + by-column embeddings then we will need to transpose
    // the heads and the sequence here, but right now simply using a strided view is more performant.
    size_t size = output_tokens.size();
    filtered.reserve(size);
    for (int i = 0; i < (size / model->n_output_heads) - model->max_delay; i++) {
        bool skip_step = false;
        for (int ii = 0; ii < model->n_output_heads; ii++) {
            int next_index = i*model->n_output_heads + model->delay_pattern[ii]*model->n_output_heads + ii;
            if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) {
                skip_step = true;
                break;
            }
        }
        if (!skip_step) {
            for (int ii = 0; ii < model->n_output_heads; ii++) {
                int next_index = i*model->n_output_heads + model->delay_pattern[ii]*model->n_output_heads + ii;
                filtered.push_back(output_tokens[next_index]);
            }
        }
    }
}
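// Note on the delay pattern used above: head ii's token for audio frame i is
// read from generation step i + delay_pattern[ii], so (illustratively, with a
// pattern like {0, 1, 2, ...}) head 0 is aligned with the frame and later
// heads lag behind by their configured delay.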
int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
    while (!check_stopping(batch)) {
        int state = decode(batch);
        if (state != 0) {
            return state;
        }
        decode_sampler->sample(dctx->logits + dctx->current_position * model->n_output_heads * model->output_vocab_size, dctx->output_tokens);
        dctx->current_position += batch.sequence_length;
        batch = dia_ubatch{ 1 };
        uint32_t * last_outputs = dctx->output_tokens.data() + (int) dctx->output_tokens.size() - model->n_output_heads;
        batch.audio_tokens.reserve(model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            batch.audio_tokens.push_back(dctx->current_position > i ? last_outputs[i] : model->bos_token_id);
        }
    }
    std::vector<uint32_t> filtered_output_tokens;
    adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);
    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
    return 0;
}

int dia_runner::generate(std::string sentence, struct tts_response * output) {
    dia_ubatch batch = batch_from_sentence(sentence);
    dctx->reset();
    decode_sampler->reset();
    dctx->current_position = 0;
    if (!kv_cross_self) {
        kv_cross_self = new dia_kv_cache;
        if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
            return 1;
        }
    }
    return generate_from_batch(batch, output);
}

void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
    if (tensor->data == NULL) {
        return;
    }
    if (name.size() == 0) {
        // handles the top level meta tensor
        return;
    }
    if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
        dac_runner->model->assign_weight(name.substr(14), tensor);
    } else {
        model->assign_weight(name, tensor);
    }
}