// koboldcpp/otherarch/ttscpp/src/orpheus_model.cpp
#include "orpheus_model.h"
#include <array>
// These tokens and variables aren't defined in the Orpheus model's configuration; instead they are defined inline in various Python functions.
// As such, they are not discoverable, so defining them as unconfigurable constants should be fine.
static constexpr std::array<const char *, 7> orpheus_voices{"zoe", "zac","jess", "leo", "mia", "julia", "leah"};
static constexpr std::array<uint32_t, 2> orpheus_prepended_tokens = { 128259, 128000 };
static constexpr std::array<uint32_t, 4> orpheus_appended_tokens = { 128009, 128260, 128261, 128257 };
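
// Assigns a top-level Orpheus tensor from the GGUF file to its slot on the model
// (final norm, LM head, token embeddings, rope frequencies, or a per-layer weight).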
void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
    if (name == "norm") {
        output_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(output_norm, tensor);
    } else if (name == "lm_head") {
        head = ggml_dup_tensor(ctx, tensor);
        set_tensor(head, tensor);
    } else if (name == "embed_tokens") {
        embd = ggml_dup_tensor(ctx, tensor);
        set_tensor(embd, tensor);
    } else if (name == "rope_frequencies") {
        rope_frequencies = ggml_dup_tensor(ctx, tensor);
        set_tensor(rope_frequencies, tensor);
    } else if (has_prefix(name, "layers")) {
        auto lpair = parse_layer_count(name);
        int l = lpair.first;
        std::string lt_name = lpair.second;
        assign_to_layer(lt_name, layers[l], tensor);
    }
}

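// Assigns a per-layer tensor to the matching attention projection, MLP projection, or layer norm.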
void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) {
    if (part == ".self_attn.k_proj") {
        layer.k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.k, tensor);
    } else if (part == ".self_attn.q_proj") {
        layer.q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.q, tensor);
    } else if (part == ".self_attn.v_proj") {
        layer.v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.v, tensor);
    } else if (part == ".self_attn.o_proj") {
        layer.o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.o, tensor);
    } else if (part == ".mlp.gate_proj") {
        layer.gate = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.gate, tensor);
    } else if (part == ".mlp.up_proj") {
        layer.up = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.up, tensor);
    } else if (part == ".mlp.down_proj") {
        layer.down = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.down, tensor);
    } else if (part == ".input_layernorm") {
        layer.input_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.input_norm, tensor);
    } else if (part == ".post_attention_layernorm") {
        layer.post_attention_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.post_attention_norm, tensor);
    }
}

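// Reads the Orpheus hyperparameters and special token ids from the GGUF metadata.
// Keys that are missing leave the corresponding defaults in place.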
void orpheus_model::prep_constants(gguf_context * meta) {
    // get constants for orpheus
    int vocab_size_key = gguf_find_key(meta, "orpheus.vocab_size");
    if (vocab_size_key != -1) {
        vocab_size = gguf_get_val_u32(meta, vocab_size_key);
    }
    int attn_heads_key = gguf_find_key(meta, "orpheus.attn_heads");
    if (attn_heads_key != -1) {
        n_attn_heads = gguf_get_val_u32(meta, attn_heads_key);
    }
    int kv_attn_heads_key = gguf_find_key(meta, "orpheus.kv_attn_heads");
    if (kv_attn_heads_key != -1) {
        n_kv_attn_heads = gguf_get_val_u32(meta, kv_attn_heads_key);
    }
    int head_size_key = gguf_find_key(meta, "orpheus.head_dim");
    if (head_size_key != -1) {
        head_size = gguf_get_val_u32(meta, head_size_key);
    }
    int stopping_token_key = gguf_find_key(meta, "orpheus.stopping_token_id");
    if (stopping_token_key != -1) {
        stopping_token_id = gguf_get_val_u32(meta, stopping_token_key);
    }
    int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
    if (eos_token_id_key != -1) {
        eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
    }
    int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
    if (bos_token_id_key != -1) {
        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    }
    int hidden_size_key = gguf_find_key(meta, "orpheus.hidden_size");
    if (hidden_size_key != -1) {
        hidden_size = gguf_get_val_u32(meta, hidden_size_key);
    }
    int kv_hidden_size_key = gguf_find_key(meta, "orpheus.kv_hidden_size");
    if (kv_hidden_size_key != -1) {
        kv_hidden_size = gguf_get_val_u32(meta, kv_hidden_size_key);
    }
}

void orpheus_model::prep_layers(gguf_context * meta) {
    int n_layers_key = gguf_find_key(meta, "orpheus.layers");
    if (n_layers_key == -1) {
        TTS_ABORT("the 'orpheus.layers' key must be specified in the GGUF file.");
    }
    n_layers = (int) gguf_get_val_u32(meta, n_layers_key);
    for (int i = 0; i < n_layers; i++) {
        layers.push_back(orpheus_layer{});
    }
}

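// RMS norm followed by an elementwise scale by the learned weight.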
struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) {
    float eps = 0.00001;
    return ggml_mul(ctx, ggml_rms_norm(ctx, x, eps), weight);
}

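// Creates the attention mask covering the cached positions plus the new tokens; its values are filled in set_inputs.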
struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) {
    octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) octx->current_position + batch.n_tokens, (int64_t) octx->current_position + batch.n_tokens);
    ggml_set_input(octx->attn_mask);
    return octx->attn_mask;
}

void orpheus_context::reset() {
    output_tokens.clear();
    current_position = 0;
    n_outputs = 0;
}

orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) {
    orpheus_context * octx = new orpheus_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        octx->backend = ggml_backend_metal_init();
#endif
    }
    octx->backend_cpu = ggml_backend_cpu_init();
    octx->set_threads();
    octx->build_schedule();
    octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return octx;
}

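// Allocates per-layer key and value cache tensors on the active backend, sized for the maximum
// context length plus the maximum generation length.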
void orpheus_runner::orpheus_kv_cache_init() {
    ggml_backend_buffer_type_t buft = nullptr;
    if (octx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }
    struct ggml_init_params params = {
        /*.mem_size   =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        TTS_ABORT("%s: failed to initialize ggml context for key value cache.\n", __func__);
    }
    if (!kv_self) {
        kv_self = new orpheus_kv_cache;
    }
    kv_self->ctx = ctx;
    kv_self->k_l.reserve(model->layers.size());
    kv_self->v_l.reserve(model->layers.size());
    for (int i = 0; i < (int) model->layers.size(); i++) {
        ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
        ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        kv_self->k_l.push_back(k);
        kv_self->v_l.push_back(v);
    }
    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft);
    ggml_backend_buffer_clear(buf, 0);
    kv_self->buf = buf;
}

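// Applies RoPE to the new keys and copies the keys and values into the layer's cache. The interleaved
// cache views repeat each KV head `repeat` times, so the cache already holds keys and values expanded
// to the full attention head count.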
void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
    k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
        model->head_size, 2, 0, 500000.0f,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
    // A performance comparison still needs to be made between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
    // and performing the repeat operation upfront followed by a single copy, in order to better optimize this function.
    // Additionally, it might be more performant to transpose the values prior to appending them to the cache, as that would save us
    // from incrementally larger transpositions as generation proceeds.
    for (int i = 0; i < repeat; i++) {
        struct ggml_tensor * k_cache_view = ggml_view_3d(
            ctx,
            kv_self->k_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
        );
        ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

        struct ggml_tensor * v_cache_view = ggml_view_3d(
            ctx,
            kv_self->v_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
        );
        ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
    }
}

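// Builds the decoder compute graph for one ubatch: token embedding lookup, then for each layer an RMS norm,
// self-attention with RoPE and the KV cache, and a SwiGLU MLP with residual connections, followed by the final
// norm and the LM head. During prompt processing only the logits of the last token are kept.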
struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
    octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->positions);

    octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->inp_tokens);

    inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);
    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;
        cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);

        struct ggml_tensor * attn_out;
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur);

            orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3);
            struct ggml_tensor * k =
                ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l],
                    model->head_size, full_sequence_length, model->n_attn_heads,
                    ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
                    ggml_element_size(kv_self->k_l[l]) * model->head_size,
                    0));

            struct ggml_tensor * v =
                ggml_view_2d(ctx, kv_self->v_l[l],
                    model->hidden_size, full_sequence_length,
                    ggml_element_size(kv_self->k_l[l]) * model->hidden_size,
                    0);
            v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

            Qcur = ggml_rope_ext(
                ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
                octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));

            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f);

            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
            attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out);
        }

        cur = ggml_add(ctx, attn_out, residual);
        struct ggml_tensor * residualffn = cur;

        // mlp
        {
            cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm);
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur));
            cur = ggml_mul_mat(ctx, model->layers[l].down, cur);
        }
        cur = ggml_add(ctx, cur, residualffn);
        inpL = cur;
    }

    cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
    // Only about 40k entries of the output head are actually used for generation purposes. Ideally the head tensor should be shrunk and the sampled tokens incremented accordingly.
    cur = ggml_mul_mat(ctx, model->head, cur);
    if (batch.n_tokens > 1) {
        cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size));
    }
    ggml_build_forward_expand(gf, cur);
    free_build();
    return gf;
}

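// Runs one forward pass for the batch: builds and schedules the graph, sets the inputs, computes,
// and copies the last token's logits into the context's output buffer.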
void orpheus_runner::decode(orpheus_ubatch & batch) {
    ggml_backend_sched_reset(octx->sched);
    octx->output_tokens.reserve(model->max_generation_size);

    const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
    octx->prep_output_buffer(new_size);

    ggml_cgraph * gf = build_orpheus_graph(batch);
    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(octx->sched, gf);

    set_inputs(batch);
    ggml_backend_sched_graph_compute_async(octx->sched, gf);

    float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
    octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

    // update the current position; the output count is only advanced after sampling
    octx->current_position += batch.n_tokens;

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(octx->sched);
}

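// Copies the batch's token ids into the graph inputs and fills the position ids and the causal
// attention mask (0 for visible positions, -INF for future positions).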
void orpheus_runner::set_inputs(orpheus_ubatch & batch) {
    ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens));
    int32_t * pos = (int32_t*) octx->positions->data;
    float * mask = (float*) octx->attn_mask->data;
    uint32_t max_pos = octx->current_position + batch.n_tokens;
    for (int i = 0; i < batch.n_tokens; i++) {
        pos[i] = (int32_t) octx->current_position + i;
        for (int ii = 0; ii < max_pos; ii++) {
            mask[i*max_pos + ii] = ii > pos[i] ? -INFINITY : 0.0f;
        }
    }
}

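// Builds the prompt for a sentence: the prepended special tokens, an optional "<voice>: " prefix,
// the tokenized text, and the appended special tokens.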
orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) {
    struct orpheus_ubatch batch;
    for (auto t : orpheus_prepended_tokens) {
        batch.tokens.push_back(t);
    }
    if (!octx->voice.empty()) {
        sentence = octx->voice + ": " + sentence;
    }
    tokenizer->tokenize(sentence, batch.tokens);
    for (auto t : orpheus_appended_tokens) {
        batch.tokens.push_back(t);
    }
    batch.n_tokens = batch.tokens.size();
    return batch;
}

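// Regroups the generated token stream into frames of 7 tokens and converts each token into an entry for the
// corresponding audio decoder head by removing the audio token offset (128266) and the per-position offset (position * 4096).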
std::vector<std::vector<uint32_t>> orpheus_runner::prepare_output_tokens() {
    size_t chunks = octx->output_tokens.size() / 7;
    std::vector<std::vector<uint32_t>> output_tokens;
    for (int i = 0; i < model->audio_heads; i++) {
        output_tokens.push_back(std::vector<uint32_t>{});
    }
    for (int i = 0; i < chunks; i++) {
        for (int ii = 0; ii < 7; ii++) {
            uint32_t thead = model->heads[ii];
            // The manipulations below are not configurable because they are performed inline via undocumented constants in the Orpheus codebase.
            // Essentially this is how Orpheus converts discrete samples from the output shape to the audio input shape.
            uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096);
            output_tokens[thead].push_back(t);
        }
    }
    return output_tokens;
}

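// Autoregressive generation loop: decode the current batch, sample the next token, and feed it back as a
// single-token batch until the stopping token is produced or the maximum generation length is reached, then
// convert the tokens and run the audio decoder to produce the waveform.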
void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) {
    while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) {
        decode(batch);
        generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens);
        // only increment the output count after sampling
        octx->n_outputs++;
        batch = orpheus_ubatch{
            1, {octx->output_tokens.back()}
        };
    }
    // this case could be better addressed by adding splitting to the generation process.
    if (octx->output_tokens.size() >= model->max_generation_size) {
        fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n");
    }
    std::vector<std::vector<uint32_t>> processed_output_tokens = prepare_output_tokens();
    srunner->run(processed_output_tokens, output);
}

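// Entry point for generation: tokenizes the sentence, validates it against the context window,
// resets the context and sampler, lazily initializes the KV cache, and runs the generation loop.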
int orpheus_runner::generate(std::string sentence, struct tts_response * response) {
    orpheus_ubatch batch = batch_from_sentence(sentence);
    // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
    // surpass the default size.
    if (batch.tokens.size() > model->max_context_length) {
        TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortening the prompt.");
    }
    octx->reset();
    generation_sampler->reset();
    if (!kv_self) {
        orpheus_kv_cache_init();
    }
    generate_from_batch(batch, response);
    return 0;
}

void orpheus_runner::configure_generation(generation_configuration * config) {
    generation_sampler->temperature = config->temperature;
    generation_sampler->repetition_penalty = config->repetition_penalty;
    generation_sampler->do_sample = config->sample;
    generation_sampler->top_k = config->top_k;
    generation_sampler->top_p = config->top_p;
    if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
    }
    octx->voice = config->voice;
}

orpheus_ubatch orpheus_runner::build_worst_case_batch() {
    orpheus_ubatch batch;
    batch.n_tokens = model->max_context_length;
    return batch;
}

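// Routes loaded tensors by name prefix: "snac." tensors go to the audio decoder model, "orpheus." tensors to the language model.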
void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) {
    if (tensor->data == NULL) {
        return;
    }
    if (name.size() == 0) {
        // handles the top level meta tensor
        return;
    }
    if (name.size() > 5 && name.substr(0, 5) == "snac.") {
        srunner->model->assign_weight(name.substr(5), tensor);
    } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") {
        model->assign_weight(name.substr(8), tensor);
    } else {
        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str());
    }
}

void orpheus_runner::prepare_post_load() {
    srunner->prepare_post_load();
    orpheus_kv_cache_init();
    auto batch = build_worst_case_batch();
    auto gf = build_orpheus_graph(batch);
    octx->prep_schedule(gf);
}

std::vector<std::string> list_voices() {
    std::vector<std::string> voices;
    voices.reserve(orpheus_voices.size());
    for (auto voice : orpheus_voices) {
        voices.push_back(voice);
    }
    return voices;
}