mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 11:16:08 +00:00
set default to GPU test
This commit is contained in:
parent
fdf868f397
commit
37ae068dee
5 changed files with 310 additions and 307 deletions
|
|
@ -19,7 +19,7 @@ AudioTokenizerDecoder::~AudioTokenizerDecoder() {
|
|||
|
||||
void AudioTokenizerDecoder::unload_model() {
|
||||
free_audio_decoder_model(model_);
|
||||
|
||||
|
||||
if (state_.sched) {
|
||||
ggml_backend_sched_free(state_.sched);
|
||||
state_.sched = nullptr;
|
||||
|
|
@ -39,32 +39,32 @@ void AudioTokenizerDecoder::unload_model() {
|
|||
|
||||
void AudioTokenizerDecoder::normalize_codebooks() {
|
||||
const float epsilon = 1e-5f;
|
||||
|
||||
|
||||
auto normalize_codebook = [epsilon](struct ggml_tensor * codebook, struct ggml_tensor * usage, const char *) {
|
||||
if (!codebook || !usage || !codebook->data || !usage->data) return;
|
||||
|
||||
|
||||
int64_t codebook_dim = codebook->ne[0];
|
||||
int64_t codebook_size = codebook->ne[1];
|
||||
|
||||
|
||||
ggml_fp16_t * cb_data = (ggml_fp16_t *)codebook->data;
|
||||
float * usage_data = (float *)usage->data;
|
||||
|
||||
|
||||
for (int64_t emb_idx = 0; emb_idx < codebook_size; ++emb_idx) {
|
||||
float u = usage_data[emb_idx];
|
||||
if (u < epsilon) u = epsilon;
|
||||
float inv_u = 1.0f / u;
|
||||
|
||||
|
||||
for (int64_t dim_idx = 0; dim_idx < codebook_dim; ++dim_idx) {
|
||||
int64_t mem_idx = dim_idx + emb_idx * codebook_dim;
|
||||
float val = ggml_fp16_to_fp32(cb_data[mem_idx]);
|
||||
cb_data[mem_idx] = ggml_fp32_to_fp16(val * inv_u);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
normalize_codebook(model_.vq_first_codebook, model_.vq_first_usage, "first");
|
||||
|
||||
|
||||
for (int i = 0; i < 15; ++i) {
|
||||
char name[16];
|
||||
snprintf(name, sizeof(name), "rest%d", i);
|
||||
|
|
@ -80,11 +80,11 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
error_msg_ = loader.get_error();
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
model_.config.sample_rate = loader.get_u32("qwen3-tts.tokenizer.sample_rate", 24000);
|
||||
model_.config.n_codebooks = loader.get_u32("qwen3-tts.tokenizer.num_codebooks", 16);
|
||||
model_.config.codebook_size = loader.get_u32("qwen3-tts.tokenizer.codebook_size", 2048);
|
||||
|
||||
|
||||
int64_t n_tensors = loader.get_n_tensors();
|
||||
int dec_tensor_count = 0;
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
|
|
@ -93,48 +93,48 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
dec_tensor_count++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (dec_tensor_count == 0) {
|
||||
error_msg_ = "No decoder tensors found in model";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
size_t ctx_size = ggml_tensor_overhead() * dec_tensor_count;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
|
||||
model_.ctx = ggml_init(params);
|
||||
if (!model_.ctx) {
|
||||
error_msg_ = "Failed to initialize GGML context";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct gguf_context * gguf_ctx = loader.get_ctx();
|
||||
struct ggml_context * meta_ctx = loader.get_meta_ctx();
|
||||
|
||||
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
const char * name = loader.get_tensor_name(i);
|
||||
if (!name || strncmp(name, "tok_dec.", 8) != 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
struct ggml_tensor * meta_tensor = ggml_get_tensor(meta_ctx, name);
|
||||
if (!meta_tensor) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(model_.ctx, meta_tensor);
|
||||
ggml_set_name(tensor, name);
|
||||
|
||||
|
||||
model_.tensors[name] = tensor;
|
||||
|
||||
|
||||
std::string sname(name);
|
||||
|
||||
|
||||
if (sname == "tok_dec.vq_first.input_proj.weight") model_.vq_first_input_proj = tensor;
|
||||
else if (sname == "tok_dec.vq_first.output_proj.weight") model_.vq_first_output_proj = tensor;
|
||||
else if (sname == "tok_dec.vq_first.0.codebook") model_.vq_first_codebook = tensor;
|
||||
|
|
@ -174,13 +174,13 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
int blk_idx, res_idx, cb_idx, n = 0;
|
||||
char suffix[64];
|
||||
size_t name_len = strlen(name);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define MATCH1(fmt, var) (sscanf(name, fmt "%n", &var, &n) == 1 && (size_t)n == name_len)
|
||||
#define MATCH2(fmt, v1, v2) (sscanf(name, fmt "%n", &v1, &v2, &n) == 2 && (size_t)n == name_len)
|
||||
#define MATCH1S(fmt, var, suf) (sscanf(name, fmt, &var, suf) == 2)
|
||||
|
||||
|
||||
if (MATCH1("tok_dec.vq_rest.%d.codebook", cb_idx)) {
|
||||
if (cb_idx >= 0 && cb_idx < 15) {
|
||||
model_.vq_rest_codebook[cb_idx] = tensor;
|
||||
|
|
@ -314,19 +314,19 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
#undef MATCH1S
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!load_tensor_data_from_file(model_path, gguf_ctx, model_.ctx,
|
||||
model_.tensors, model_.buffer, error_msg_,
|
||||
GGML_BACKEND_DEVICE_TYPE_IGPU)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
model_.dec_blocks[i].res[0].dilation = 1;
|
||||
model_.dec_blocks[i].res[1].dilation = 3;
|
||||
model_.dec_blocks[i].res[2].dilation = 9;
|
||||
}
|
||||
|
||||
|
||||
normalize_codebooks();
|
||||
// Codebooks are normalized in host memory; sync once to backend tensors.
|
||||
auto upload_if_present = [](struct ggml_tensor * t) {
|
||||
|
|
@ -338,8 +338,8 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
for (int i = 0; i < 15; ++i) {
|
||||
upload_if_present(model_.vq_rest_codebook[i]);
|
||||
}
|
||||
|
||||
state_.backend = init_preferred_backend("AudioTokenizerDecoder", &error_msg_);
|
||||
|
||||
state_.backend = init_preferred_backend("AudioTokenizerDecoder", &error_msg_, true);
|
||||
if (!state_.backend) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -347,7 +347,7 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
ggml_backend_dev_t device = ggml_backend_get_device(state_.backend);
|
||||
const char * device_name = device ? ggml_backend_dev_name(device) : "Unknown";
|
||||
fprintf(stderr, " AudioTokenizerDecoder backend: %s\n", device_name);
|
||||
|
||||
|
||||
if (device && ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
state_.backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (!state_.backend_cpu) {
|
||||
|
|
@ -366,9 +366,9 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
|
|||
error_msg_ = "Failed to create backend scheduler";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_DEC_MAX_NODES + ggml_graph_overhead());
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -379,25 +379,25 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_snake(struct ggml_context * ct
|
|||
int64_t seq_len = x->ne[0];
|
||||
int64_t channels = x->ne[1];
|
||||
int64_t batch = x->ne[2];
|
||||
|
||||
|
||||
struct ggml_tensor * alpha_exp = ggml_exp(ctx, alpha);
|
||||
|
||||
|
||||
struct ggml_tensor * alpha_3d = ggml_reshape_3d(ctx, alpha_exp, 1, channels, 1);
|
||||
struct ggml_tensor * alpha_broad = ggml_repeat(ctx, alpha_3d,
|
||||
struct ggml_tensor * alpha_broad = ggml_repeat(ctx, alpha_3d,
|
||||
ggml_new_tensor_3d(ctx, GGML_TYPE_F32, seq_len, channels, batch));
|
||||
|
||||
|
||||
struct ggml_tensor * ax = ggml_mul(ctx, x, alpha_broad);
|
||||
struct ggml_tensor * sin_ax = ggml_sin(ctx, ax);
|
||||
struct ggml_tensor * sin_sq = ggml_sqr(ctx, sin_ax);
|
||||
|
||||
|
||||
struct ggml_tensor * neg_beta = ggml_scale(ctx, beta, -1.0f);
|
||||
struct ggml_tensor * inv_beta_exp = ggml_exp(ctx, neg_beta);
|
||||
struct ggml_tensor * inv_beta_3d = ggml_reshape_3d(ctx, inv_beta_exp, 1, channels, 1);
|
||||
struct ggml_tensor * inv_beta = ggml_repeat(ctx, inv_beta_3d,
|
||||
struct ggml_tensor * inv_beta = ggml_repeat(ctx, inv_beta_3d,
|
||||
ggml_new_tensor_3d(ctx, GGML_TYPE_F32, seq_len, channels, batch));
|
||||
|
||||
|
||||
struct ggml_tensor * scaled_sin = ggml_mul(ctx, sin_sq, inv_beta);
|
||||
|
||||
|
||||
return ggml_add(ctx, x, scaled_sin);
|
||||
}
|
||||
|
||||
|
|
@ -418,72 +418,72 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_pre_tfm_layer(struct ggml_cont
|
|||
const int n_heads = cfg.n_heads;
|
||||
const int qkv_dim = cfg.latent_dim;
|
||||
const int head_dim = qkv_dim / n_heads;
|
||||
|
||||
|
||||
if (!layer.attn_norm_w || !layer.attn_q_w || !layer.attn_k_w || !layer.attn_v_w ||
|
||||
!layer.attn_output_w || !layer.ffn_norm_w || !layer.ffn_gate_w ||
|
||||
!layer.attn_output_w || !layer.ffn_norm_w || !layer.ffn_gate_w ||
|
||||
!layer.ffn_up_w || !layer.ffn_down_w) {
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * residual = x;
|
||||
|
||||
|
||||
struct ggml_tensor * normed = apply_rms_norm(ctx, x, layer.attn_norm_w, cfg.rms_norm_eps);
|
||||
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer.attn_q_w, normed);
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer.attn_k_w, normed);
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer.attn_v_w, normed);
|
||||
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx, Qcur, head_dim, n_heads, n_frames);
|
||||
Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_heads, n_frames);
|
||||
Vcur = ggml_reshape_3d(ctx, Vcur, head_dim, n_heads, n_frames);
|
||||
|
||||
|
||||
Qcur = ggml_rope_ext(ctx, Qcur, positions, nullptr,
|
||||
head_dim, GGML_ROPE_TYPE_NEOX, 0,
|
||||
cfg.rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
|
||||
|
||||
|
||||
Kcur = ggml_rope_ext(ctx, Kcur, positions, nullptr,
|
||||
head_dim, GGML_ROPE_TYPE_NEOX, 0,
|
||||
cfg.rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
|
||||
|
||||
|
||||
struct ggml_tensor * Q = ggml_permute(ctx, Qcur, 0, 2, 1, 3);
|
||||
struct ggml_tensor * K = ggml_permute(ctx, Kcur, 0, 2, 1, 3);
|
||||
struct ggml_tensor * V = ggml_permute(ctx, Vcur, 0, 2, 1, 3);
|
||||
|
||||
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
|
||||
KQ = ggml_scale(ctx, KQ, 1.0f / sqrtf((float)head_dim));
|
||||
// Apply causal mask (each position can only attend to itself and previous positions)
|
||||
KQ = ggml_diag_mask_inf(ctx, KQ, 0);
|
||||
KQ = ggml_soft_max(ctx, KQ);
|
||||
|
||||
|
||||
V = ggml_cont(ctx, ggml_transpose(ctx, V));
|
||||
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx, V, KQ);
|
||||
KQV = ggml_permute(ctx, KQV, 0, 2, 1, 3);
|
||||
struct ggml_tensor * attn_out = ggml_cont_2d(ctx, KQV, n_heads * head_dim, n_frames);
|
||||
|
||||
|
||||
attn_out = ggml_mul_mat(ctx, layer.attn_output_w, attn_out);
|
||||
|
||||
|
||||
if (layer.attn_scale) {
|
||||
attn_out = ggml_mul(ctx, attn_out, layer.attn_scale);
|
||||
}
|
||||
|
||||
|
||||
x = ggml_add(ctx, residual, attn_out);
|
||||
residual = x;
|
||||
|
||||
|
||||
normed = apply_rms_norm(ctx, x, layer.ffn_norm_w, cfg.rms_norm_eps);
|
||||
|
||||
|
||||
struct ggml_tensor * gate = ggml_mul_mat(ctx, layer.ffn_gate_w, normed);
|
||||
struct ggml_tensor * up = ggml_mul_mat(ctx, layer.ffn_up_w, normed);
|
||||
|
||||
|
||||
gate = ggml_silu(ctx, gate);
|
||||
struct ggml_tensor * ffn_out = ggml_mul(ctx, gate, up);
|
||||
|
||||
|
||||
ffn_out = ggml_mul_mat(ctx, layer.ffn_down_w, ffn_out);
|
||||
|
||||
|
||||
if (layer.ffn_scale) {
|
||||
ffn_out = ggml_mul(ctx, ffn_out, layer.ffn_scale);
|
||||
}
|
||||
|
||||
|
||||
return ggml_add(ctx, residual, ffn_out);
|
||||
}
|
||||
|
||||
|
|
@ -493,19 +493,19 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_upsample_block(struct ggml_con
|
|||
int block_idx) {
|
||||
int64_t seq_len = x->ne[0];
|
||||
int64_t channels = x->ne[1];
|
||||
|
||||
|
||||
struct ggml_tensor * x_2d = ggml_reshape_2d(ctx, x, seq_len, channels);
|
||||
x_2d = ggml_conv_transpose_1d(ctx, block.conv_w, x_2d, 2, 0, 1);
|
||||
|
||||
|
||||
int64_t new_seq_len = x_2d->ne[0];
|
||||
x = ggml_reshape_3d(ctx, x_2d, new_seq_len, channels, 1);
|
||||
|
||||
|
||||
if (block.conv_b) {
|
||||
x = ggml_add(ctx, x, ggml_reshape_3d(ctx, block.conv_b, 1, channels, 1));
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * residual = x;
|
||||
|
||||
|
||||
if (block.dwconv_w) {
|
||||
// Causal padding: pad left with 6 zeros (kernel_size - 1 = 7 - 1 = 6)
|
||||
x = ggml_pad_ext(ctx, x, 6, 0, 0, 0, 0, 0, 0, 0); // left pad only
|
||||
|
|
@ -514,37 +514,37 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_upsample_block(struct ggml_con
|
|||
x = ggml_add(ctx, x, ggml_reshape_3d(ctx, block.dwconv_b, 1, channels, 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
x = ggml_permute(ctx, x, 1, 0, 2, 3);
|
||||
x = ggml_cont(ctx, x);
|
||||
|
||||
|
||||
if (block.norm_w && block.norm_b) {
|
||||
x = ggml_norm(ctx, x, 1e-6f);
|
||||
x = ggml_mul(ctx, x, block.norm_w);
|
||||
x = ggml_add(ctx, x, block.norm_b);
|
||||
}
|
||||
|
||||
|
||||
x = ggml_mul_mat(ctx, block.pwconv1_w, x);
|
||||
if (block.pwconv1_b) {
|
||||
x = ggml_add(ctx, x, block.pwconv1_b);
|
||||
}
|
||||
|
||||
|
||||
x = ggml_gelu(ctx, x);
|
||||
|
||||
|
||||
x = ggml_mul_mat(ctx, block.pwconv2_w, x);
|
||||
if (block.pwconv2_b) {
|
||||
x = ggml_add(ctx, x, block.pwconv2_b);
|
||||
}
|
||||
|
||||
|
||||
x = ggml_permute(ctx, x, 1, 0, 2, 3);
|
||||
x = ggml_cont(ctx, x);
|
||||
|
||||
|
||||
if (block.gamma) {
|
||||
struct ggml_tensor * gamma_3d = ggml_reshape_3d(ctx, block.gamma, 1, channels, 1);
|
||||
x = ggml_mul(ctx, x, ggml_repeat(ctx, gamma_3d,
|
||||
x = ggml_mul(ctx, x, ggml_repeat(ctx, gamma_3d,
|
||||
ggml_new_tensor_3d(ctx, GGML_TYPE_F32, new_seq_len, channels, 1)));
|
||||
}
|
||||
|
||||
|
||||
return ggml_add(ctx, residual, x);
|
||||
}
|
||||
|
||||
|
|
@ -552,11 +552,11 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_residual_block(struct ggml_con
|
|||
struct ggml_tensor * x,
|
||||
const residual_block & block) {
|
||||
struct ggml_tensor * residual = x;
|
||||
|
||||
|
||||
if (block.act1_alpha) {
|
||||
x = apply_snake(ctx, x, block.act1_alpha, block.act1_beta);
|
||||
}
|
||||
|
||||
|
||||
int64_t out_channels = block.conv1_w->ne[2];
|
||||
int padding = 6 * block.dilation;
|
||||
x = ggml_pad_ext(ctx, x, padding, 0, 0, 0, 0, 0, 0, 0);
|
||||
|
|
@ -564,17 +564,17 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_residual_block(struct ggml_con
|
|||
if (block.conv1_b) {
|
||||
x = ggml_add(ctx, x, ggml_reshape_3d(ctx, block.conv1_b, 1, out_channels, 1));
|
||||
}
|
||||
|
||||
|
||||
if (block.act2_alpha) {
|
||||
x = apply_snake(ctx, x, block.act2_alpha, block.act2_beta);
|
||||
}
|
||||
|
||||
|
||||
out_channels = block.conv2_w->ne[2];
|
||||
x = ggml_conv_1d(ctx, block.conv2_w, x, 1, 0, 1);
|
||||
if (block.conv2_b) {
|
||||
x = ggml_add(ctx, x, ggml_reshape_3d(ctx, block.conv2_b, 1, out_channels, 1));
|
||||
}
|
||||
|
||||
|
||||
return ggml_add(ctx, residual, x);
|
||||
}
|
||||
|
||||
|
|
@ -586,101 +586,101 @@ struct ggml_tensor * AudioTokenizerDecoder::apply_decoder_block(struct ggml_cont
|
|||
if (block.snake_alpha && block.snake_beta) {
|
||||
x = apply_snake(ctx, x, block.snake_alpha, block.snake_beta);
|
||||
}
|
||||
|
||||
|
||||
int64_t seq_len = x->ne[0];
|
||||
int64_t in_channels = x->ne[1];
|
||||
int64_t out_channels = block.conv_t_w->ne[1];
|
||||
int kernel_size = block.conv_t_w->ne[0];
|
||||
|
||||
|
||||
struct ggml_tensor * x_2d = ggml_reshape_2d(ctx, x, seq_len, in_channels);
|
||||
x_2d = ggml_conv_transpose_1d(ctx, block.conv_t_w, x_2d, upsample_rate, 0, 1);
|
||||
|
||||
|
||||
int64_t new_seq_len = x_2d->ne[0];
|
||||
x = ggml_reshape_3d(ctx, x_2d, new_seq_len, out_channels, 1);
|
||||
|
||||
|
||||
// Python CausalTransConvNet: left_pad = right_pad = kernel_size - stride
|
||||
int pad = kernel_size - upsample_rate;
|
||||
int left_pad = pad;
|
||||
int right_pad = pad;
|
||||
int64_t out_seq_len = new_seq_len - left_pad - right_pad;
|
||||
|
||||
|
||||
x = ggml_view_3d(ctx, x, out_seq_len, out_channels, 1,
|
||||
x->nb[1], x->nb[2], left_pad * x->nb[0]);
|
||||
x = ggml_cont(ctx, x);
|
||||
|
||||
|
||||
if (block.conv_t_b) {
|
||||
x = ggml_add(ctx, x, ggml_reshape_3d(ctx, block.conv_t_b, 1, out_channels, 1));
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
x = apply_residual_block(ctx, x, block.res[i]);
|
||||
}
|
||||
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) {
|
||||
const auto & cfg = model_.config;
|
||||
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ state_.compute_meta.size(),
|
||||
/*.mem_buffer =*/ state_.compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_DEC_MAX_NODES, false);
|
||||
|
||||
|
||||
static const char * cb_names[16] = {
|
||||
"codes_cb0", "codes_cb1", "codes_cb2", "codes_cb3",
|
||||
"codes_cb4", "codes_cb5", "codes_cb6", "codes_cb7",
|
||||
"codes_cb8", "codes_cb9", "codes_cb10", "codes_cb11",
|
||||
"codes_cb12", "codes_cb13", "codes_cb14", "codes_cb15"
|
||||
};
|
||||
|
||||
|
||||
struct ggml_tensor * cb_codes_tensors[16];
|
||||
for (int cb = 0; cb < 16; ++cb) {
|
||||
cb_codes_tensors[cb] = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_frames);
|
||||
ggml_set_name(cb_codes_tensors[cb], cb_names[cb]);
|
||||
ggml_set_input(cb_codes_tensors[cb]);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * first_codes = cb_codes_tensors[0];
|
||||
|
||||
|
||||
struct ggml_tensor * first_emb = ggml_get_rows(ctx0, model_.vq_first_codebook, first_codes);
|
||||
ggml_set_name(first_emb, "first_emb_raw");
|
||||
|
||||
|
||||
struct ggml_tensor * rest_emb[15];
|
||||
for (int cb = 0; cb < 15; ++cb) {
|
||||
struct ggml_tensor * cb_codes = cb_codes_tensors[cb + 1];
|
||||
rest_emb[cb] = ggml_get_rows(ctx0, model_.vq_rest_codebook[cb], cb_codes);
|
||||
|
||||
|
||||
if (cb == 0) {
|
||||
ggml_set_name(rest_emb[cb], "rest_cb0_emb_raw");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * first_emb_2d = ggml_reshape_2d(ctx0, first_emb, cfg.codebook_dim, n_frames);
|
||||
ggml_set_name(first_emb_2d, "first_emb_2d");
|
||||
|
||||
struct ggml_tensor * first_proj_weight_2d = ggml_reshape_2d(ctx0, model_.vq_first_output_proj,
|
||||
|
||||
struct ggml_tensor * first_proj_weight_2d = ggml_reshape_2d(ctx0, model_.vq_first_output_proj,
|
||||
cfg.codebook_dim, cfg.hidden_dim);
|
||||
struct ggml_tensor * first_proj_2d = ggml_mul_mat(ctx0, first_proj_weight_2d, first_emb_2d);
|
||||
ggml_set_name(first_proj_2d, "first_proj_2d");
|
||||
|
||||
|
||||
struct ggml_tensor * rest_proj_weight_2d = ggml_reshape_2d(ctx0, model_.vq_rest_output_proj,
|
||||
cfg.codebook_dim, cfg.hidden_dim);
|
||||
|
||||
|
||||
struct ggml_tensor * rest_proj_2d = nullptr;
|
||||
for (int cb = 0; cb < 15; ++cb) {
|
||||
struct ggml_tensor * cb_emb_2d = ggml_reshape_2d(ctx0, rest_emb[cb], cfg.codebook_dim, n_frames);
|
||||
|
||||
|
||||
if (cb == 0) {
|
||||
ggml_set_name(cb_emb_2d, "rest_cb0_emb_2d");
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * cb_proj_2d = ggml_mul_mat(ctx0, rest_proj_weight_2d, cb_emb_2d);
|
||||
|
||||
|
||||
if (rest_proj_2d == nullptr) {
|
||||
rest_proj_2d = cb_proj_2d;
|
||||
} else {
|
||||
|
|
@ -688,82 +688,82 @@ struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) {
|
|||
}
|
||||
}
|
||||
ggml_set_name(rest_proj_2d, "rest_proj_2d");
|
||||
|
||||
|
||||
struct ggml_tensor * latent_2d = ggml_add(ctx0, first_proj_2d, rest_proj_2d);
|
||||
ggml_set_name(latent_2d, "latent_2d");
|
||||
|
||||
|
||||
struct ggml_tensor * latent_t = ggml_transpose(ctx0, latent_2d);
|
||||
ggml_set_name(latent_t, "latent_t");
|
||||
|
||||
|
||||
struct ggml_tensor * latent_cont = ggml_cont(ctx0, latent_t);
|
||||
ggml_set_name(latent_cont, "latent_cont");
|
||||
|
||||
|
||||
struct ggml_tensor * latent = ggml_reshape_3d(ctx0, latent_cont, n_frames, cfg.hidden_dim, 1);
|
||||
|
||||
ggml_set_name(latent, "vq_output");
|
||||
|
||||
|
||||
struct ggml_tensor * latent_for_conv = ggml_cont(ctx0, latent);
|
||||
struct ggml_tensor * latent_padded = ggml_pad_ext(ctx0, latent_for_conv, 2, 0, 0, 0, 0, 0, 0, 0);
|
||||
struct ggml_tensor * cur = ggml_conv_1d(ctx0, model_.pre_conv_w, latent_padded, 1, 0, 1);
|
||||
if (model_.pre_conv_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model_.pre_conv_b, 1, cfg.latent_dim, 1));
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "pre_conv_output");
|
||||
|
||||
|
||||
struct ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, n_frames, cfg.latent_dim);
|
||||
struct ggml_tensor * cur_t = ggml_transpose(ctx0, cur_2d);
|
||||
cur = ggml_cont(ctx0, cur_t);
|
||||
|
||||
|
||||
ggml_set_name(cur, "pre_conv_reshaped");
|
||||
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model_.pre_tfm_input_proj_w, cur);
|
||||
if (model_.pre_tfm_input_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model_.pre_tfm_input_proj_b);
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "pre_tfm_input");
|
||||
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_frames);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
|
||||
for (int i = 0; i < cfg.n_pre_tfm_layers; ++i) {
|
||||
cur = apply_pre_tfm_layer(ctx0, cur, model_.pre_tfm_layers[i], n_frames, positions);
|
||||
}
|
||||
|
||||
|
||||
if (model_.pre_tfm_norm_w) {
|
||||
cur = apply_rms_norm(ctx0, cur, model_.pre_tfm_norm_w, cfg.rms_norm_eps);
|
||||
}
|
||||
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model_.pre_tfm_output_proj_w, cur);
|
||||
if (model_.pre_tfm_output_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model_.pre_tfm_output_proj_b);
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "pre_tfm_output");
|
||||
|
||||
|
||||
cur = ggml_permute(ctx0, cur, 1, 0, 2, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_frames, cfg.latent_dim, 1);
|
||||
|
||||
|
||||
ggml_set_name(cur, "pre_tfm_reshaped");
|
||||
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
cur = apply_upsample_block(ctx0, cur, model_.upsample[i], i);
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "upsample_output");
|
||||
|
||||
|
||||
// Causal padding: left pad with 6 (kernel_size - 1 = 7 - 1 = 6)
|
||||
cur = ggml_pad_ext(ctx0, cur, 6, 0, 0, 0, 0, 0, 0, 0);
|
||||
cur = ggml_conv_1d(ctx0, model_.dec0_conv_w, cur, 1, 0, 1);
|
||||
if (model_.dec0_conv_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model_.dec0_conv_b, 1, cfg.decoder_dim, 1));
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "dec0_output");
|
||||
|
||||
|
||||
int upsample_rates[4] = {8, 5, 4, 3};
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
cur = apply_decoder_block(ctx0, cur, model_.dec_blocks[i], upsample_rates[i], i);
|
||||
|
|
@ -771,33 +771,33 @@ struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) {
|
|||
snprintf(name, sizeof(name), "dec%d_output", i + 1);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
|
||||
if (model_.dec5_snake_alpha) {
|
||||
cur = apply_snake(ctx0, cur, model_.dec5_snake_alpha, model_.dec5_snake_beta);
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "dec5_output");
|
||||
|
||||
|
||||
// Causal padding: left pad with 6 (kernel_size - 1 = 7 - 1 = 6)
|
||||
cur = ggml_pad_ext(ctx0, cur, 6, 0, 0, 0, 0, 0, 0, 0);
|
||||
cur = ggml_conv_1d(ctx0, model_.dec6_conv_w, cur, 1, 0, 1);
|
||||
if (model_.dec6_conv_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model_.dec6_conv_b, 1, 1, 1));
|
||||
}
|
||||
|
||||
|
||||
ggml_set_name(cur, "dec6_output");
|
||||
|
||||
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
|
||||
|
||||
cur = ggml_reshape_1d(ctx0, cur, cur->ne[0]);
|
||||
|
||||
|
||||
ggml_set_name(cur, "audio");
|
||||
ggml_set_output(cur);
|
||||
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
|
@ -807,23 +807,23 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
|
|||
error_msg_ = "Model not loaded";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
const auto & cfg = model_.config;
|
||||
|
||||
|
||||
codes_buf_.resize(n_frames * cfg.n_codebooks);
|
||||
for (int f = 0; f < n_frames; ++f) {
|
||||
for (int cb = 0; cb < cfg.n_codebooks; ++cb) {
|
||||
codes_buf_[cb + f * cfg.n_codebooks] = codes[f * cfg.n_codebooks + cb];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct ggml_cgraph * gf = build_graph(n_frames);
|
||||
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(state_.sched, gf)) {
|
||||
error_msg_ = "Failed to allocate graph";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
std::vector<int32_t> cb_codes(n_frames);
|
||||
for (int cb = 0; cb < 16; ++cb) {
|
||||
char name[32];
|
||||
|
|
@ -834,47 +834,47 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
|
|||
ggml_backend_sched_reset(state_.sched);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
for (int f = 0; f < n_frames; ++f) {
|
||||
cb_codes[f] = codes_buf_[f * cfg.n_codebooks + cb];
|
||||
}
|
||||
|
||||
|
||||
ggml_backend_tensor_set(cb_tensor, cb_codes.data(), 0, n_frames * sizeof(int32_t));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
struct ggml_tensor * positions_tensor = ggml_graph_get_tensor(gf, "positions");
|
||||
if (positions_tensor) {
|
||||
std::vector<int32_t> positions(n_frames);
|
||||
for (int i = 0; i < n_frames; ++i) {
|
||||
positions[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions_tensor, positions.data(), 0,
|
||||
ggml_backend_tensor_set(positions_tensor, positions.data(), 0,
|
||||
n_frames * sizeof(int32_t));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if (ggml_backend_sched_graph_compute(state_.sched, gf) != GGML_STATUS_SUCCESS) {
|
||||
error_msg_ = "Failed to compute graph";
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * audio_tensor = ggml_graph_get_tensor(gf, "audio");
|
||||
if (!audio_tensor) {
|
||||
error_msg_ = "Failed to find audio tensor";
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
int64_t n_samples = audio_tensor->ne[0];
|
||||
samples.resize(n_samples);
|
||||
ggml_backend_tensor_get(audio_tensor, samples.data(), 0, n_samples * sizeof(float));
|
||||
|
||||
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ namespace qwen3_tts {
|
|||
|
||||
// Mel filterbank computation using librosa slaney normalization
|
||||
// This matches librosa.filters.mel with norm='slaney'
|
||||
static void compute_mel_filterbank_slaney(float * filterbank, int n_mels, int n_fft,
|
||||
static void compute_mel_filterbank_slaney(float * filterbank, int n_mels, int n_fft,
|
||||
int sample_rate, float f_min, float f_max) {
|
||||
// Slaney-style mel scale (used by librosa default)
|
||||
auto hz_to_mel_slaney = [](float hz) -> float {
|
||||
|
|
@ -22,64 +22,64 @@ static void compute_mel_filterbank_slaney(float * filterbank, int n_mels, int n_
|
|||
const float min_log_hz = 1000.0f;
|
||||
const float min_log_mel = (min_log_hz - 0.0f) / f_sp; // 15
|
||||
const float logstep = logf(6.4f) / 27.0f; // log(6400/1000) / 27
|
||||
|
||||
|
||||
if (hz < min_log_hz) {
|
||||
return (hz - 0.0f) / f_sp;
|
||||
} else {
|
||||
return min_log_mel + logf(hz / min_log_hz) / logstep;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Maps a Slaney-scale mel value back to Hz: linear below the 1000 Hz knee,
// exponential (log-spaced) at and above it.
auto mel_to_hz_slaney = [](float mel) -> float {
    const float hz_per_mel = 200.0f / 3.0f;                  // slope of the linear region
    const float knee_hz    = 1000.0f;                        // start of the log region
    const float knee_mel   = (knee_hz - 0.0f) / hz_per_mel;  // mel value at the knee
    const float log_step   = logf(6.4f) / 27.0f;             // log(6400/1000) / 27

    const bool in_linear_region = mel < knee_mel;
    return in_linear_region
        ? 0.0f + hz_per_mel * mel
        : knee_hz * expf(log_step * (mel - knee_mel));
};
|
||||
|
||||
|
||||
float mel_min = hz_to_mel_slaney(f_min);
|
||||
float mel_max = hz_to_mel_slaney(f_max);
|
||||
|
||||
|
||||
int n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
|
||||
// Compute n_mels + 2 evenly spaced points on the mel scale (filter edges and centers)
|
||||
std::vector<float> mel_points(n_mels + 2);
|
||||
for (int i = 0; i < n_mels + 2; ++i) {
|
||||
mel_points[i] = mel_min + (mel_max - mel_min) * i / (n_mels + 1);
|
||||
}
|
||||
|
||||
|
||||
// Convert the mel points to Hz and compute the frequency (in Hz) of each FFT bin
|
||||
std::vector<float> hz_points(n_mels + 2);
|
||||
std::vector<float> fft_freqs(n_fft_bins);
|
||||
|
||||
|
||||
for (int i = 0; i < n_mels + 2; ++i) {
|
||||
hz_points[i] = mel_to_hz_slaney(mel_points[i]);
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < n_fft_bins; ++i) {
|
||||
fft_freqs[i] = (float)i * sample_rate / n_fft;
|
||||
}
|
||||
|
||||
|
||||
memset(filterbank, 0, n_mels * n_fft_bins * sizeof(float));
|
||||
|
||||
|
||||
// Create triangular filters with slaney normalization
|
||||
for (int m = 0; m < n_mels; ++m) {
|
||||
float f_left = hz_points[m];
|
||||
float f_center = hz_points[m + 1];
|
||||
float f_right = hz_points[m + 2];
|
||||
|
||||
|
||||
// Slaney normalization: divide by bandwidth (area normalization)
|
||||
float enorm = 2.0f / (f_right - f_left);
|
||||
|
||||
|
||||
for (int k = 0; k < n_fft_bins; ++k) {
|
||||
float freq = fft_freqs[k];
|
||||
|
||||
|
||||
if (freq >= f_left && freq <= f_center) {
|
||||
if (f_center > f_left) {
|
||||
filterbank[m * n_fft_bins + k] = enorm * (freq - f_left) / (f_center - f_left);
|
||||
|
|
@ -116,7 +116,7 @@ static void compute_hann_window(float * window, int n) {
|
|||
static void compute_centered_window(float * window, int n_fft, int win_length) {
|
||||
// Zero-initialize
|
||||
memset(window, 0, n_fft * sizeof(float));
|
||||
|
||||
|
||||
// Compute Hann window of win_length
|
||||
int offset = (n_fft - win_length) / 2;
|
||||
for (int i = 0; i < win_length; ++i) {
|
||||
|
|
@ -128,7 +128,7 @@ AudioTokenizerEncoder::AudioTokenizerEncoder() = default;
|
|||
|
||||
AudioTokenizerEncoder::~AudioTokenizerEncoder() {
|
||||
free_speaker_encoder_model(model_);
|
||||
|
||||
|
||||
if (state_.sched) {
|
||||
ggml_backend_sched_free(state_.sched);
|
||||
state_.sched = nullptr;
|
||||
|
|
@ -149,10 +149,10 @@ bool AudioTokenizerEncoder::load_model(const std::string & model_path) {
|
|||
error_msg_ = loader.get_error();
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
model_.config.sample_rate = loader.get_u32("qwen3-tts.speaker_encoder.sample_rate", 24000);
|
||||
model_.config.embedding_dim = loader.get_u32("qwen3-tts.speaker_encoder.embedding_length", 1024);
|
||||
|
||||
|
||||
int64_t n_tensors = loader.get_n_tensors();
|
||||
int spk_tensor_count = 0;
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
|
|
@ -161,46 +161,46 @@ bool AudioTokenizerEncoder::load_model(const std::string & model_path) {
|
|||
spk_tensor_count++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (spk_tensor_count == 0) {
|
||||
error_msg_ = "No speaker encoder tensors found in model";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
size_t ctx_size = ggml_tensor_overhead() * spk_tensor_count;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
|
||||
model_.ctx = ggml_init(params);
|
||||
if (!model_.ctx) {
|
||||
error_msg_ = "Failed to initialize GGML context";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct gguf_context * gguf_ctx = loader.get_ctx();
|
||||
struct ggml_context * meta_ctx = loader.get_meta_ctx();
|
||||
|
||||
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
const char * name = loader.get_tensor_name(i);
|
||||
if (!name || strncmp(name, "spk_enc.", 8) != 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * meta_tensor = ggml_get_tensor(meta_ctx, name);
|
||||
if (!meta_tensor) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(model_.ctx, meta_tensor);
|
||||
ggml_set_name(tensor, name);
|
||||
|
||||
|
||||
model_.tensors[name] = tensor;
|
||||
|
||||
|
||||
std::string sname(name);
|
||||
|
||||
|
||||
if (sname == "spk_enc.conv0.weight") model_.conv0_w = tensor;
|
||||
else if (sname == "spk_enc.conv0.bias") model_.conv0_b = tensor;
|
||||
else if (sname == "spk_enc.mfa.weight") model_.mfa_w = tensor;
|
||||
|
|
@ -214,7 +214,7 @@ bool AudioTokenizerEncoder::load_model(const std::string & model_path) {
|
|||
else {
|
||||
int blk_idx, res_idx;
|
||||
char suffix[64];
|
||||
|
||||
|
||||
if (sscanf(name, "spk_enc.blk.%d.tdnn1.%s", &blk_idx, suffix) == 2) {
|
||||
if (blk_idx >= 1 && blk_idx <= 3) {
|
||||
if (strcmp(suffix, "weight") == 0) model_.blocks[blk_idx-1].tdnn1_w = tensor;
|
||||
|
|
@ -247,20 +247,20 @@ bool AudioTokenizerEncoder::load_model(const std::string & model_path) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!load_tensor_data_from_file(model_path, gguf_ctx, model_.ctx,
|
||||
model_.tensors, model_.buffer, error_msg_)) {
|
||||
|
||||
if (!load_tensor_data_from_file(model_path, gguf_ctx, model_.ctx,
|
||||
model_.tensors, model_.buffer, error_msg_, GGML_BACKEND_DEVICE_TYPE_GPU)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
state_.backend = init_preferred_backend("AudioTokenizerEncoder", &error_msg_);
|
||||
|
||||
state_.backend = init_preferred_backend("AudioTokenizerEncoder", &error_msg_, true);
|
||||
if (!state_.backend) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_dev_t device = ggml_backend_get_device(state_.backend);
|
||||
const char * device_name = device ? ggml_backend_dev_name(device) : "Unknown";
|
||||
fprintf(stderr, " AudioTokenizerEncoder backend: %s\n", device_name);
|
||||
|
||||
|
||||
if (device && ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
state_.backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (!state_.backend_cpu) {
|
||||
|
|
@ -279,20 +279,20 @@ bool AudioTokenizerEncoder::load_model(const std::string & model_path) {
|
|||
error_msg_ = "Failed to create backend scheduler";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_MAX_NODES + ggml_graph_overhead());
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AudioTokenizerEncoder::compute_mel_spectrogram(const float * samples, int32_t n_samples,
|
||||
std::vector<float> & mel, int32_t & n_frames) {
|
||||
const auto & cfg = model_.config;
|
||||
|
||||
|
||||
// Match PyTorch STFT padding: (n_fft - hop_size) // 2 on each side with reflect
|
||||
int padding = (cfg.n_fft - cfg.hop_length) / 2;
|
||||
int padded_length = n_samples + 2 * padding;
|
||||
|
||||
|
||||
// Create padded signal with reflect padding
|
||||
std::vector<float> padded(padded_length);
|
||||
for (int i = 0; i < padded_length; ++i) {
|
||||
|
|
@ -310,50 +310,50 @@ bool AudioTokenizerEncoder::compute_mel_spectrogram(const float * samples, int32
|
|||
src_idx = std::max(0, std::min(n_samples - 1, src_idx));
|
||||
padded[i] = samples[src_idx];
|
||||
}
|
||||
|
||||
|
||||
// With center=False, frames start at 0 and step by hop_length
|
||||
n_frames = (padded_length - cfg.n_fft) / cfg.hop_length + 1;
|
||||
if (n_frames <= 0) {
|
||||
error_msg_ = "Audio too short for mel spectrogram";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
int n_fft_bins = cfg.n_fft / 2 + 1;
|
||||
|
||||
|
||||
std::vector<float> filterbank(cfg.n_mels * n_fft_bins);
|
||||
compute_mel_filterbank_slaney(filterbank.data(), cfg.n_mels, cfg.n_fft,
|
||||
compute_mel_filterbank_slaney(filterbank.data(), cfg.n_mels, cfg.n_fft,
|
||||
cfg.sample_rate, cfg.f_min, cfg.f_max);
|
||||
|
||||
|
||||
// PyTorch STFT with win_length < n_fft centers the window in the n_fft frame
|
||||
// This is critical for matching PyTorch's output
|
||||
std::vector<float> window(cfg.n_fft);
|
||||
compute_centered_window(window.data(), cfg.n_fft, cfg.win_length);
|
||||
|
||||
|
||||
// Output: [batch, n_mels, n_frames] but we store as [n_mels, n_frames] row-major
|
||||
// which means mel[m * n_frames + f] = value at mel bin m, frame f
|
||||
mel.resize(cfg.n_mels * n_frames);
|
||||
|
||||
|
||||
std::vector<float> frame(cfg.n_fft, 0.0f);
|
||||
std::vector<float> fft_real(cfg.n_fft);
|
||||
std::vector<float> fft_imag(cfg.n_fft);
|
||||
std::vector<float> magnitude(n_fft_bins);
|
||||
|
||||
|
||||
for (int32_t f = 0; f < n_frames; ++f) {
|
||||
int start = f * cfg.hop_length;
|
||||
|
||||
|
||||
// Apply centered window to n_fft samples
|
||||
for (int i = 0; i < cfg.n_fft; ++i) {
|
||||
frame[i] = padded[start + i] * window[i];
|
||||
}
|
||||
|
||||
|
||||
compute_dft(frame.data(), fft_real.data(), fft_imag.data(), cfg.n_fft);
|
||||
|
||||
|
||||
// Compute magnitude (not power) - matches torch.stft with return_complex=True then abs()
|
||||
// spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
|
||||
for (int k = 0; k < n_fft_bins; ++k) {
|
||||
magnitude[k] = sqrtf(fft_real[k] * fft_real[k] + fft_imag[k] * fft_imag[k] + 1e-9f);
|
||||
}
|
||||
|
||||
|
||||
// Apply mel filterbank and log compression
|
||||
// mel_spec = torch.matmul(mel_basis, spec)
|
||||
// mel_spec = dynamic_range_compression_torch(mel_spec) # log(clamp(x, min=1e-5) * 1)
|
||||
|
|
@ -366,7 +366,7 @@ bool AudioTokenizerEncoder::compute_mel_spectrogram(const float * samples, int32
|
|||
mel[m * n_frames + f] = logf(std::max(sum, 1e-5f));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -376,41 +376,41 @@ static struct ggml_tensor * apply_reflect_pad_1d(struct ggml_context * ctx,
|
|||
if (pad == 0) {
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
int64_t T = x->ne[0];
|
||||
int64_t C = x->ne[1];
|
||||
int64_t B = x->ne[2];
|
||||
|
||||
|
||||
struct ggml_tensor * left_slices[16];
|
||||
struct ggml_tensor * right_slices[16];
|
||||
|
||||
|
||||
for (int i = 0; i < pad && i < 16; ++i) {
|
||||
int left_src_idx = pad - i;
|
||||
left_slices[i] = ggml_view_3d(ctx, x, 1, C, B,
|
||||
x->nb[1], x->nb[2],
|
||||
left_src_idx * x->nb[0]);
|
||||
left_slices[i] = ggml_cont(ctx, left_slices[i]);
|
||||
|
||||
|
||||
int right_src_idx = T - 2 - i;
|
||||
right_slices[i] = ggml_view_3d(ctx, x, 1, C, B,
|
||||
x->nb[1], x->nb[2],
|
||||
right_src_idx * x->nb[0]);
|
||||
right_slices[i] = ggml_cont(ctx, right_slices[i]);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * left_pad = left_slices[0];
|
||||
for (int i = 1; i < pad && i < 16; ++i) {
|
||||
left_pad = ggml_concat(ctx, left_pad, left_slices[i], 0);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * right_pad = right_slices[0];
|
||||
for (int i = 1; i < pad && i < 16; ++i) {
|
||||
right_pad = ggml_concat(ctx, right_pad, right_slices[i], 0);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * padded = ggml_concat(ctx, left_pad, x, 0);
|
||||
padded = ggml_concat(ctx, padded, right_pad, 0);
|
||||
|
||||
|
||||
return padded;
|
||||
}
|
||||
|
||||
|
|
@ -423,12 +423,12 @@ static struct ggml_tensor * apply_conv1d(struct ggml_context * ctx,
|
|||
bool use_reflect_pad = true) {
|
||||
struct ggml_tensor * input = x;
|
||||
int actual_pad = pad;
|
||||
|
||||
|
||||
if (use_reflect_pad && pad > 0) {
|
||||
input = apply_reflect_pad_1d(ctx, x, pad);
|
||||
actual_pad = 0;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * y = ggml_conv_1d(ctx, w, input, stride, actual_pad, dilation);
|
||||
if (debug_name) {
|
||||
char name[64];
|
||||
|
|
@ -447,32 +447,32 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
const int hidden_dim = cfg.hidden_dim; // 512
|
||||
const int scale = cfg.res2net_scale; // 8
|
||||
const int branch_dim = hidden_dim / scale; // 64
|
||||
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ state_.compute_meta.size(),
|
||||
/*.mem_buffer =*/ state_.compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, QWEN3_TTS_MAX_NODES, false);
|
||||
|
||||
|
||||
// Input: mel spectrogram [n_mels, n_frames] - stored as [n_mels, n_frames] row-major
|
||||
// GGML uses column-major, so this is [n_frames, n_mels] in GGML notation
|
||||
struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_frames, cfg.n_mels);
|
||||
ggml_set_name(mel, "mel");
|
||||
ggml_set_input(mel);
|
||||
|
||||
|
||||
// PyTorch: hidden_states = hidden_states.transpose(1, 2) # [B, T, C] -> [B, C, T]
|
||||
// Our mel is [n_frames, n_mels] in GGML = [n_mels, n_frames] row-major
|
||||
// For conv1d, we need [T, C, B] in GGML = [B, C, T] row-major
|
||||
// So reshape to [n_frames, n_mels, 1]
|
||||
struct ggml_tensor * cur = ggml_reshape_3d(ctx0, mel, n_frames, cfg.n_mels, 1);
|
||||
ggml_set_name(cur, "mel_3d");
|
||||
|
||||
|
||||
struct ggml_tensor * mel_padded = apply_reflect_pad_1d(ctx0, cur, 2);
|
||||
ggml_set_name(mel_padded, "mel_padded");
|
||||
|
||||
|
||||
cur = ggml_conv_1d(ctx0, model_.conv0_w, mel_padded, 1, 0, 1);
|
||||
ggml_set_name(cur, "conv0_conv");
|
||||
|
||||
|
|
@ -483,37 +483,37 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
ggml_set_name(cur, "conv0_pre_relu");
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
ggml_set_name(cur, "conv0_out");
|
||||
|
||||
|
||||
int64_t seq_len = cur->ne[0];
|
||||
|
||||
|
||||
// Store block outputs for MFA (including block 0)
|
||||
struct ggml_tensor * block_outputs[4];
|
||||
block_outputs[0] = cur; // Initial TDNN output
|
||||
|
||||
|
||||
// Blocks 1-3: SE-Res2Net blocks
|
||||
// Dilations: block1=2, block2=3, block3=4
|
||||
int dilations[3] = {2, 3, 4};
|
||||
|
||||
|
||||
for (int blk = 0; blk < 3; ++blk) {
|
||||
const auto & block = model_.blocks[blk];
|
||||
int dilation = dilations[blk];
|
||||
|
||||
|
||||
struct ggml_tensor * residual = cur;
|
||||
|
||||
|
||||
cur = apply_conv1d(ctx0, block.tdnn1_w, block.tdnn1_b, cur, 1, 0, 1);
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
if (blk == 0) {
|
||||
ggml_set_name(cur, "blk1_tdnn1");
|
||||
}
|
||||
|
||||
|
||||
// Res2Net: Split into 8 branches of 64 channels each
|
||||
// cur shape: [seq_len, 512, 1]
|
||||
// Branch 0: identity (no conv)
|
||||
// Branch i (1-7): conv(hidden_part + previous_output) for i >= 2, conv(hidden_part) for i == 1
|
||||
|
||||
|
||||
// Split channels: view as [seq_len, 64, 8] then split
|
||||
struct ggml_tensor * branches[8];
|
||||
|
||||
|
||||
// Extract each branch using view operations
|
||||
// cur is [seq_len, 512, 1], we want to split dim 1 into 8 parts of 64
|
||||
for (int b = 0; b < scale; ++b) {
|
||||
|
|
@ -522,17 +522,17 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
// nb1 = stride for dim 1 = cur->nb[1] (bytes to move from one channel to next)
|
||||
// nb2 = stride for dim 2 = cur->nb[2] (bytes to move from one batch to next)
|
||||
// offset = b * 64 * cur->nb[1] (skip b*64 channels)
|
||||
branches[b] = ggml_view_3d(ctx0, cur,
|
||||
branches[b] = ggml_view_3d(ctx0, cur,
|
||||
seq_len, branch_dim, 1,
|
||||
cur->nb[1], cur->nb[2],
|
||||
cur->nb[1], cur->nb[2],
|
||||
b * branch_dim * cur->nb[1]);
|
||||
branches[b] = ggml_cont(ctx0, branches[b]);
|
||||
}
|
||||
|
||||
|
||||
// Process branches according to Res2Net logic
|
||||
struct ggml_tensor * outputs[8];
|
||||
outputs[0] = branches[0]; // Branch 0: identity
|
||||
|
||||
|
||||
for (int b = 1; b < scale; ++b) {
|
||||
struct ggml_tensor * input;
|
||||
if (b == 1) {
|
||||
|
|
@ -541,18 +541,18 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
// Add previous output to current branch
|
||||
input = ggml_add(ctx0, branches[b], outputs[b - 1]);
|
||||
}
|
||||
|
||||
|
||||
// Apply conv with dilation (kernel=3)
|
||||
// Padding for kernel=3, dilation=d: pad = d * (3-1) / 2 = d
|
||||
if (block.res2net_w[b - 1]) {
|
||||
outputs[b] = apply_conv1d(ctx0, block.res2net_w[b - 1], block.res2net_b[b - 1],
|
||||
outputs[b] = apply_conv1d(ctx0, block.res2net_w[b - 1], block.res2net_b[b - 1],
|
||||
input, 1, dilation, dilation);
|
||||
outputs[b] = ggml_relu(ctx0, outputs[b]);
|
||||
} else {
|
||||
outputs[b] = input; // Fallback if weight missing
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
cur = outputs[0];
|
||||
for (int b = 1; b < scale; ++b) {
|
||||
cur = ggml_concat(ctx0, cur, outputs[b], 1);
|
||||
|
|
@ -565,40 +565,40 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
ggml_set_name(outputs[b], name);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
cur = apply_conv1d(ctx0, block.tdnn2_w, block.tdnn2_b, cur, 1, 0, 1);
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
if (blk == 0) {
|
||||
ggml_set_name(cur, "blk1_tdnn2");
|
||||
}
|
||||
|
||||
|
||||
// SE (Squeeze-Excitation)
|
||||
// Global average pooling over time: mean(dim=2, keepdim=True)
|
||||
struct ggml_tensor * se = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, seq_len, seq_len, 0);
|
||||
se = ggml_reshape_3d(ctx0, se, 1, hidden_dim, 1);
|
||||
|
||||
|
||||
// SE conv1: 512 -> 128 with ReLU
|
||||
se = apply_conv1d(ctx0, block.se_conv1_w, block.se_conv1_b, se, 1, 0, 1);
|
||||
se = ggml_relu(ctx0, se);
|
||||
|
||||
|
||||
// SE conv2: 128 -> 512 with Sigmoid
|
||||
se = apply_conv1d(ctx0, block.se_conv2_w, block.se_conv2_b, se, 1, 0, 1);
|
||||
se = ggml_sigmoid(ctx0, se);
|
||||
|
||||
|
||||
cur = ggml_mul(ctx0, cur, se);
|
||||
if (blk == 0) {
|
||||
ggml_set_name(cur, "blk1_se");
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
|
||||
|
||||
char block_name[32];
|
||||
snprintf(block_name, sizeof(block_name), "block_%d", blk + 1);
|
||||
ggml_set_name(cur, block_name);
|
||||
|
||||
|
||||
block_outputs[blk + 1] = cur;
|
||||
}
|
||||
|
||||
|
||||
// MFA: Concatenate block outputs [1:] (blocks 1, 2, 3 = indices 1, 2, 3)
|
||||
// hidden_states = torch.cat(hidden_states_list[1:], dim=1)
|
||||
// Each block output is [seq_len, 512, 1]
|
||||
|
|
@ -606,20 +606,20 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
struct ggml_tensor * mfa_input = ggml_concat(ctx0, block_outputs[1], block_outputs[2], 1);
|
||||
mfa_input = ggml_concat(ctx0, mfa_input, block_outputs[3], 1);
|
||||
ggml_set_name(mfa_input, "mfa_input");
|
||||
|
||||
|
||||
// MFA conv: 1536 -> 1536 with ReLU
|
||||
cur = apply_conv1d(ctx0, model_.mfa_w, model_.mfa_b, mfa_input, 1, 0, 1);
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
ggml_set_name(cur, "mfa_out");
|
||||
|
||||
|
||||
// ASP (Attentive Statistics Pooling)
|
||||
// cur shape: [seq_len, 1536, 1]
|
||||
|
||||
|
||||
// Step 1: Compute global mean and std over time
|
||||
// mean = hidden_states.mean(dim=2, keepdim=True) # [1, 1536, 1]
|
||||
struct ggml_tensor * global_mean = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, seq_len, seq_len, 0);
|
||||
global_mean = ggml_reshape_3d(ctx0, global_mean, 1, 1536, 1);
|
||||
|
||||
|
||||
// std = sqrt(E[x^2] - E[x]^2)
|
||||
struct ggml_tensor * sq = ggml_sqr(ctx0, cur);
|
||||
struct ggml_tensor * mean_sq = ggml_pool_1d(ctx0, sq, GGML_OP_POOL_AVG, seq_len, seq_len, 0);
|
||||
|
|
@ -627,20 +627,20 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
struct ggml_tensor * var = ggml_sub(ctx0, mean_sq, ggml_sqr(ctx0, global_mean));
|
||||
var = ggml_clamp(ctx0, var, 1e-12f, 1e10f);
|
||||
struct ggml_tensor * global_std = ggml_sqrt(ctx0, var);
|
||||
|
||||
|
||||
// Step 2: Expand mean and std to full sequence length and concatenate with hidden_states
|
||||
// mean = mean.repeat(1, 1, seq_length) # [1, 1536, seq_len]
|
||||
// std = std.repeat(1, 1, seq_length) # [1, 1536, seq_len]
|
||||
// attention = torch.cat([hidden_states, mean, std], dim=1) # [1, 4608, seq_len]
|
||||
struct ggml_tensor * mean_expanded = ggml_repeat(ctx0, global_mean,
|
||||
struct ggml_tensor * mean_expanded = ggml_repeat(ctx0, global_mean,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, seq_len, 1536, 1));
|
||||
struct ggml_tensor * std_expanded = ggml_repeat(ctx0, global_std,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, seq_len, 1536, 1));
|
||||
|
||||
|
||||
struct ggml_tensor * attention = ggml_concat(ctx0, cur, mean_expanded, 1);
|
||||
attention = ggml_concat(ctx0, attention, std_expanded, 1);
|
||||
// attention shape: [seq_len, 4608, 1]
|
||||
|
||||
|
||||
// Step 3: TDNN (4608 -> 128) with ReLU, then Tanh
|
||||
// self.tdnn = TimeDelayNetBlock(channels * 3, attention_channels, 1, 1) # has ReLU
|
||||
// attention = self.conv(self.tanh(self.tdnn(attention)))
|
||||
|
|
@ -648,17 +648,17 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
attention = ggml_relu(ctx0, attention); // TDNN has ReLU
|
||||
ggml_set_name(attention, "asp_tdnn");
|
||||
attention = ggml_tanh(ctx0, attention); // Then tanh is applied
|
||||
|
||||
|
||||
// Step 4: Conv (128 -> 1536) for attention weights
|
||||
// self.conv = nn.Conv1d(attention_channels, channels, kernel_size=1)
|
||||
attention = apply_conv1d(ctx0, model_.asp_conv_w, model_.asp_conv_b, attention, 1, 0, 1);
|
||||
ggml_set_name(attention, "asp_conv");
|
||||
// attention shape: [seq_len, 1536, 1]
|
||||
|
||||
|
||||
// Step 5: Softmax over time dimension
|
||||
attention = ggml_soft_max(ctx0, attention);
|
||||
ggml_set_name(attention, "asp_softmax");
|
||||
|
||||
|
||||
// Step 6: Compute weighted mean and std
|
||||
// mean, std = self._compute_statistics(hidden_states, attention)
|
||||
// mean = (attention * hidden_states).sum(dim=2)
|
||||
|
|
@ -666,7 +666,7 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
struct ggml_tensor * weighted_mean = ggml_pool_1d(ctx0, weighted, GGML_OP_POOL_AVG, seq_len, seq_len, 0);
|
||||
weighted_mean = ggml_scale(ctx0, weighted_mean, (float)seq_len); // Convert avg to sum
|
||||
weighted_mean = ggml_reshape_3d(ctx0, weighted_mean, 1, 1536, 1);
|
||||
|
||||
|
||||
// std = sqrt((attention * (hidden_states - mean)^2).sum(dim=2).clamp(eps))
|
||||
struct ggml_tensor * mean_for_std = ggml_repeat(ctx0, weighted_mean,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, seq_len, 1536, 1));
|
||||
|
|
@ -678,25 +678,25 @@ struct ggml_cgraph * AudioTokenizerEncoder::build_graph(int32_t n_frames) {
|
|||
var_sum = ggml_reshape_3d(ctx0, var_sum, 1, 1536, 1);
|
||||
var_sum = ggml_clamp(ctx0, var_sum, 1e-12f, 1e10f);
|
||||
struct ggml_tensor * weighted_std = ggml_sqrt(ctx0, var_sum);
|
||||
|
||||
|
||||
// Step 7: Concatenate mean and std: [1, 3072, 1]
|
||||
struct ggml_tensor * pooled = ggml_concat(ctx0, weighted_mean, weighted_std, 1);
|
||||
ggml_set_name(pooled, "asp_pooled");
|
||||
|
||||
|
||||
// FC: 3072 -> 1024
|
||||
cur = apply_conv1d(ctx0, model_.fc_w, model_.fc_b, pooled, 1, 0, 1);
|
||||
ggml_set_name(cur, "fc_out");
|
||||
|
||||
|
||||
// Squeeze to 1D
|
||||
cur = ggml_reshape_1d(ctx0, cur, cfg.embedding_dim);
|
||||
|
||||
|
||||
ggml_set_name(cur, "embedding");
|
||||
ggml_set_output(cur);
|
||||
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
|
@ -706,27 +706,27 @@ bool AudioTokenizerEncoder::encode(const float * samples, int32_t n_samples,
|
|||
error_msg_ = "Model not loaded";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
std::vector<float> mel;
|
||||
int32_t n_frames;
|
||||
if (!compute_mel_spectrogram(samples, n_samples, mel, n_frames)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_cgraph * gf = build_graph(n_frames);
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(state_.sched, gf)) {
|
||||
error_msg_ = "Failed to allocate graph";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * mel_tensor = ggml_graph_get_tensor(gf, "mel");
|
||||
if (!mel_tensor) {
|
||||
error_msg_ = "Failed to find mel tensor";
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// mel is stored as [n_mels, n_frames] row-major: mel[m * n_frames + f] = mel bin m at frame f
|
||||
// GGML tensor is [n_frames, n_mels] column-major: element (f, m) at memory[f + m * n_frames]
|
||||
// For GGML conv1d, we want input(t, c) = mel bin c at time t
|
||||
|
|
@ -734,7 +734,7 @@ bool AudioTokenizerEncoder::encode(const float * samples, int32_t n_samples,
|
|||
// Since the memory layout matches (both are contiguous in frame order for each mel bin),
|
||||
// we can copy directly!
|
||||
ggml_backend_tensor_set(mel_tensor, mel.data(), 0, mel.size() * sizeof(float));
|
||||
|
||||
|
||||
if (ggml_backend_sched_graph_compute(state_.sched, gf) != GGML_STATUS_SUCCESS) {
|
||||
error_msg_ = "Failed to compute graph";
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
|
|
@ -747,12 +747,12 @@ bool AudioTokenizerEncoder::encode(const float * samples, int32_t n_samples,
|
|||
ggml_backend_sched_reset(state_.sched);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
embedding.resize(model_.config.embedding_dim);
|
||||
ggml_backend_tensor_get(emb_tensor, embedding.data(), 0, embedding.size() * sizeof(float));
|
||||
|
||||
|
||||
ggml_backend_sched_reset(state_.sched);
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ GGUFLoader::~GGUFLoader() {
|
|||
close();
|
||||
}
|
||||
|
||||
ggml_backend_t init_preferred_backend(const char * component_name, std::string * error_msg) {
|
||||
ggml_backend_t init_preferred_backend(const char * component_name, std::string * error_msg, bool allow_gpu) {
|
||||
if (error_msg) error_msg->clear();
|
||||
|
||||
auto & shared = get_shared_backend_state();
|
||||
|
|
@ -34,9 +34,12 @@ ggml_backend_t init_preferred_backend(const char * component_name, std::string *
|
|||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
|
||||
if(allow_gpu)
|
||||
{
|
||||
if (!backend) {
|
||||
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
|
||||
}
|
||||
}
|
||||
if (!backend) {
|
||||
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_ACCEL, nullptr);
|
||||
}
|
||||
|
|
@ -78,20 +81,20 @@ void release_preferred_backend(ggml_backend_t backend) {
|
|||
|
||||
bool GGUFLoader::open(const std::string & path) {
|
||||
close(); // Close any previously opened file
|
||||
|
||||
|
||||
file_path_ = path;
|
||||
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc =*/ true,
|
||||
/*.ctx =*/ &meta_ctx_,
|
||||
};
|
||||
|
||||
|
||||
ctx_ = gguf_init_from_file(path.c_str(), params);
|
||||
if (!ctx_) {
|
||||
error_msg_ = "Failed to open GGUF file: " + path;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -168,7 +171,7 @@ bool load_tensor_data_from_file(
|
|||
error_msg = "Failed to initialize backend for GGUF tensor loader";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Allocate buffer for all tensors
|
||||
buffer = ggml_backend_alloc_ctx_tensors(model_ctx, backend);
|
||||
if (!buffer) {
|
||||
|
|
@ -176,7 +179,7 @@ bool load_tensor_data_from_file(
|
|||
ggml_backend_free(backend);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Open file for reading tensor data
|
||||
FILE * f = fopen(path.c_str(), "rb");
|
||||
if (!f) {
|
||||
|
|
@ -184,45 +187,45 @@ bool load_tensor_data_from_file(
|
|||
ggml_backend_free(backend);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
const size_t data_offset = gguf_get_data_offset(ctx);
|
||||
const int64_t n_tensors = gguf_get_n_tensors(ctx);
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
|
||||
auto it = tensors.find(name);
|
||||
if (it == tensors.end()) {
|
||||
continue; // Skip tensors not in our map
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * tensor = it->second;
|
||||
size_t nbytes = ggml_nbytes(tensor);
|
||||
|
||||
|
||||
read_buf.resize(nbytes);
|
||||
|
||||
|
||||
if (fseek(f, data_offset + offset, SEEK_SET) != 0) {
|
||||
error_msg = "Failed to seek to tensor data: " + std::string(name);
|
||||
fclose(f);
|
||||
ggml_backend_free(backend);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
if (fread(read_buf.data(), 1, nbytes, f) != nbytes) {
|
||||
error_msg = "Failed to read tensor data: " + std::string(name);
|
||||
fclose(f);
|
||||
ggml_backend_free(backend);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes);
|
||||
}
|
||||
|
||||
|
||||
fclose(f);
|
||||
ggml_backend_free(backend);
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -17,44 +17,44 @@ class GGUFLoader {
|
|||
public:
|
||||
GGUFLoader();
|
||||
~GGUFLoader();
|
||||
|
||||
|
||||
// Open GGUF file and parse metadata
|
||||
bool open(const std::string & path);
|
||||
|
||||
|
||||
// Close file and free resources
|
||||
void close();
|
||||
|
||||
|
||||
// Get error message if operation failed
|
||||
const std::string & get_error() const { return error_msg_; }
|
||||
|
||||
|
||||
// Get number of tensors in file
|
||||
int64_t get_n_tensors() const;
|
||||
|
||||
|
||||
// Get tensor name by index
|
||||
const char * get_tensor_name(int64_t idx) const;
|
||||
|
||||
|
||||
// Get tensor type by index
|
||||
enum ggml_type get_tensor_type(int64_t idx) const;
|
||||
|
||||
|
||||
// Get tensor offset by index
|
||||
size_t get_tensor_offset(int64_t idx) const;
|
||||
|
||||
|
||||
// Get tensor size by index
|
||||
size_t get_tensor_size(int64_t idx) const;
|
||||
|
||||
|
||||
// Get metadata value (returns default_val if the key is not found)
|
||||
int32_t get_u32(const char * key, int32_t default_val = 0) const;
|
||||
float get_f32(const char * key, float default_val = 0.0f) const;
|
||||
|
||||
|
||||
// Get data offset (start of tensor data in file)
|
||||
size_t get_data_offset() const;
|
||||
|
||||
|
||||
// Get GGUF context (for advanced usage)
|
||||
struct gguf_context * get_ctx() const { return ctx_; }
|
||||
|
||||
|
||||
// Get metadata context
|
||||
struct ggml_context * get_meta_ctx() const { return meta_ctx_; }
|
||||
|
||||
|
||||
protected:
|
||||
struct gguf_context * ctx_ = nullptr;
|
||||
struct ggml_context * meta_ctx_ = nullptr;
|
||||
|
|
@ -70,11 +70,11 @@ bool load_tensor_data_from_file(
|
|||
const std::map<std::string, struct ggml_tensor *> & tensors,
|
||||
ggml_backend_buffer_t & buffer,
|
||||
std::string & error_msg,
|
||||
enum ggml_backend_dev_type preferred_backend_type = GGML_BACKEND_DEVICE_TYPE_CPU
|
||||
enum ggml_backend_dev_type preferred_backend_type = GGML_BACKEND_DEVICE_TYPE_GPU
|
||||
);
|
||||
|
||||
// Helper to initialize backend with GPU preference and CPU fallback
|
||||
ggml_backend_t init_preferred_backend(const char * component_name, std::string * error_msg);
|
||||
ggml_backend_t init_preferred_backend(const char * component_name, std::string * error_msg, bool allow_gpu);
|
||||
void release_preferred_backend(ggml_backend_t backend);
|
||||
|
||||
// Helper function to free model resources
|
||||
|
|
|
|||
|
|
@ -97,7 +97,7 @@ bool TTSTransformer::load_model(const std::string & model_path) {
|
|||
gguf_free(ctx);
|
||||
if (meta_ctx) ggml_free(meta_ctx);
|
||||
|
||||
state_.backend = init_preferred_backend("TTSTransformer", &error_msg_);
|
||||
state_.backend = init_preferred_backend("TTSTransformer", &error_msg_, true);
|
||||
if (!state_.backend) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -553,7 +553,7 @@ bool TTSTransformer::create_tensors(struct gguf_context * ctx) {
|
|||
}
|
||||
|
||||
bool TTSTransformer::load_tensor_data(const std::string & path, struct gguf_context * ctx) {
|
||||
ggml_backend_t backend = init_preferred_backend("TTSTransformer", &error_msg_);
|
||||
ggml_backend_t backend = init_preferred_backend("TTSTransformer", &error_msg_, true);
|
||||
if (!backend) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue