#include "orpheus_model.h"
|
|
|
|
#include <array>
|
|
|
|
// These tokens and variables aren't defined in the Orpheus model configuration; instead they are defined inline in various python functions.
// As such, they are not discoverable, so defining them as unconfigurable constants should be fine.
static constexpr std::array<const char *, 7> orpheus_voices{"zoe", "zac", "jess", "leo", "mia", "julia", "leah"};
static constexpr std::array<uint32_t, 2> orpheus_prepended_tokens = { 128259, 128000 };
static constexpr std::array<uint32_t, 4> orpheus_appended_tokens = { 128009, 128260, 128261, 128257 };

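// Maps top-level GGUF tensor names to the model's tensor slots. Each incoming tensor is
// duplicated into the model's own ggml context and its data is then set via set_tensor;
// layer tensors (prefixed with "layers") are dispatched to assign_to_layer below.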
void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
    if (name == "norm") {
        output_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(output_norm, tensor);
    } else if (name == "lm_head") {
        head = ggml_dup_tensor(ctx, tensor);
        set_tensor(head, tensor);
    } else if (name == "embed_tokens") {
        embd = ggml_dup_tensor(ctx, tensor);
        set_tensor(embd, tensor);
    } else if (name == "rope_frequencies") {
        rope_frequencies = ggml_dup_tensor(ctx, tensor);
        set_tensor(rope_frequencies, tensor);
    } else if (has_prefix(name, "layers")) {
        auto lpair = parse_layer_count(name);
        int l = lpair.first;
        std::string lt_name = lpair.second;
        assign_to_layer(lt_name, layers[l], tensor);
    }
}

void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) {
    if (part == ".self_attn.k_proj") {
        layer.k = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.k, tensor);
    } else if (part == ".self_attn.q_proj") {
        layer.q = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.q, tensor);
    } else if (part == ".self_attn.v_proj") {
        layer.v = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.v, tensor);
    } else if (part == ".self_attn.o_proj") {
        layer.o = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.o, tensor);
    } else if (part == ".mlp.gate_proj") {
        layer.gate = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.gate, tensor);
    } else if (part == ".mlp.up_proj") {
        layer.up = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.up, tensor);
    } else if (part == ".mlp.down_proj") {
        layer.down = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.down, tensor);
    } else if (part == ".input_layernorm") {
        layer.input_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.input_norm, tensor);
    } else if (part == ".post_attention_layernorm") {
        layer.post_attention_norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(layer.post_attention_norm, tensor);
    }
}

void orpheus_model::prep_constants(gguf_context * meta) {
    // get constants for orpheus
    int vocab_size_key = gguf_find_key(meta, "orpheus.vocab_size");
    if (vocab_size_key != -1) {
        vocab_size = gguf_get_val_u32(meta, vocab_size_key);
    }

    int attn_heads_key = gguf_find_key(meta, "orpheus.attn_heads");
    if (attn_heads_key != -1) {
        n_attn_heads = gguf_get_val_u32(meta, attn_heads_key);
    }

    int kv_attn_heads_key = gguf_find_key(meta, "orpheus.kv_attn_heads");
    if (kv_attn_heads_key != -1) {
        n_kv_attn_heads = gguf_get_val_u32(meta, kv_attn_heads_key);
    }

    int head_size_key = gguf_find_key(meta, "orpheus.head_dim");
    if (head_size_key != -1) {
        head_size = gguf_get_val_u32(meta, head_size_key);
    }

    int stopping_token_key = gguf_find_key(meta, "orpheus.stopping_token_id");
    if (stopping_token_key != -1) {
        stopping_token_id = gguf_get_val_u32(meta, stopping_token_key);
    }

    int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
    if (eos_token_id_key != -1) {
        eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
    }

    int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
    if (bos_token_id_key != -1) {
        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    }

    int hidden_size_key = gguf_find_key(meta, "orpheus.hidden_size");
    if (hidden_size_key != -1) {
        hidden_size = gguf_get_val_u32(meta, hidden_size_key);
    }

    int kv_hidden_size_key = gguf_find_key(meta, "orpheus.kv_hidden_size");
    if (kv_hidden_size_key != -1) {
        kv_hidden_size = gguf_get_val_u32(meta, kv_hidden_size_key);
    }
}

void orpheus_model::prep_layers(gguf_context * meta) {
    int n_layers_key = gguf_find_key(meta, "orpheus.layers");
    if (n_layers_key == -1) {
        TTS_ABORT("The key 'orpheus.layers' must be specified in the GGUF file.");
    }
    n_layers = (int) gguf_get_val_u32(meta, n_layers_key);
    for (int i = 0; i < n_layers; i++) {
        layers.push_back(orpheus_layer{});
    }
}

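// RMS layer norm: normalize x by its root mean square (eps = 1e-5) and scale the result
// element-wise by the learned weight, matching the RMSNorm used by Llama-style models.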
struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) {
    float eps = 0.00001f;
    return ggml_mul(ctx, ggml_rms_norm(ctx, x, eps), weight);
}

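// Creates the attention mask input as a square tensor over the full sequence
// (cached positions + new batch tokens); its causal contents are filled in later by
// orpheus_runner::set_inputs.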
struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) {
    octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) octx->current_position + batch.n_tokens, (int64_t) octx->current_position + batch.n_tokens);
    ggml_set_input(octx->attn_mask);
    return octx->attn_mask;
}

void orpheus_context::reset() {
    output_tokens.clear();
    current_position = 0;
    n_outputs = 0;
}

orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) {
    orpheus_context * octx = new orpheus_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        octx->backend = ggml_backend_metal_init();
#endif
    }
    octx->backend_cpu = ggml_backend_cpu_init();
    octx->set_threads();
    octx->build_schedule();
    octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return octx;
}

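// Allocates the KV cache: one flat K tensor and one flat V tensor per layer, each sized
// to hold hidden_size values for every position of the maximum context plus the maximum
// generation length. Graph building later takes strided views into these flat buffers.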
void orpheus_runner::orpheus_kv_cache_init() {
    ggml_backend_buffer_type_t buft = nullptr;
    if (octx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    struct ggml_init_params params = {
        /*.mem_size   =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        TTS_ABORT("%s: failed to initialize ggml context for key value cache.\n", __func__);
    }
    if (!kv_self) {
        kv_self = new orpheus_kv_cache;
    }
    kv_self->ctx = ctx;
    kv_self->k_l.reserve(model->layers.size());
    kv_self->v_l.reserve(model->layers.size());

    for (int i = 0; i < (int) model->layers.size(); i++) {
        ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
        ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        kv_self->k_l.push_back(k);
        kv_self->v_l.push_back(v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft);
    ggml_backend_buffer_clear(buf, 0);
    kv_self->buf = buf;
}

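// Appends the current key/value projections for one layer to the cache. Orpheus uses
// grouped-query attention (n_kv_attn_heads < n_attn_heads), and rather than repeating
// KV heads at attention time, each KV head is copied into the cache 'repeat' times
// (a repeat_interleave), so the cache can be read back as n_attn_heads full heads.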
void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
    k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
        model->head_size, 2, 0, 500000.0f,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

    // A performance comparison still needs to be made between this method (performing
    // 'repeat' incremental copy operations to achieve a repeat_interleave) and performing
    // the repeat operation upfront followed by a single copy, in order to better optimize
    // this function. Additionally, it might be more performant to transpose the values
    // before appending them to the cache, as that would save us from incrementally larger
    // transpositions as generation proceeds.
    for (int i = 0; i < repeat; i++) {
        struct ggml_tensor * k_cache_view = ggml_view_3d(
            ctx,
            kv_self->k_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
        );
        ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

        struct ggml_tensor * v_cache_view = ggml_view_3d(
            ctx,
            kv_self->v_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->v_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->v_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->v_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->v_l[index]) * model->head_size
        );
        ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
    }
}

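// Builds the decoder graph for one ubatch: embedding lookup, then for each layer
// RMS norm -> grouped-query self-attention with rotary position embeddings over the KV
// cache -> residual add -> RMS norm -> SwiGLU MLP -> residual add, followed by a final
// RMS norm and the lm_head projection. When prefilling more than one token, only the
// logits for the last position are kept.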
struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
    octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->positions);
    octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->inp_tokens);
    inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);

    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;
        cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);

        struct ggml_tensor * attn_out;

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur);

            orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3);
            struct ggml_tensor * k =
                ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l],
                    model->head_size, full_sequence_length, model->n_attn_heads,
                    ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
                    ggml_element_size(kv_self->k_l[l]) * model->head_size,
                    0));

            struct ggml_tensor * v =
                ggml_view_2d(ctx, kv_self->v_l[l],
                    model->hidden_size, full_sequence_length,
                    ggml_element_size(kv_self->v_l[l]) * model->hidden_size,
                    0);

            v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

            Qcur = ggml_rope_ext(
                ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
                octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
            attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out);
        }

        cur = ggml_add(ctx, attn_out, residual);

        struct ggml_tensor * residualffn = cur;

        // mlp
        {
            cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm);
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur));
            cur = ggml_mul_mat(ctx, model->layers[l].down, cur);
        }
        cur = ggml_add(ctx, cur, residualffn);
        inpL = cur;
    }

    cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
    // Only about 40k entries of the output head are actually used for generation purposes.
    // Ideally the head tensor should be shrunk and the sampled tokens incremented accordingly.
    cur = ggml_mul_mat(ctx, model->head, cur);
    if (batch.n_tokens > 1) {
        cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size));
    }
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}

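// Runs a single forward pass: builds and computes the graph for the batch, then copies
// the last position's logits into the context's output buffer and advances the current
// position by the number of tokens decoded.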
void orpheus_runner::decode(orpheus_ubatch & batch) {
    ggml_backend_sched_reset(octx->sched);

    octx->output_tokens.reserve(model->max_generation_size);

    const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
    octx->prep_output_buffer(new_size);

    ggml_cgraph * gf = build_orpheus_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(octx->sched, gf);

    set_inputs(batch);
    ggml_backend_sched_graph_compute_async(octx->sched, gf);

    float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
    octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

    // update the total number of outputs retrieved and the current position
    octx->current_position += batch.n_tokens;

    // Reset state for the next token before the backend sync, to allow the CPU activities
    // in the reset to overlap with device computation.
    ggml_backend_sched_reset(octx->sched);
}

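// Copies the batch's token ids and absolute positions into the graph inputs and fills
// the causal attention mask: row i (the new token at position current_position + i) can
// attend to every position up to and including its own; later positions get -INFINITY.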
void orpheus_runner::set_inputs(orpheus_ubatch & batch) {
    ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens));
    int32_t * pos = (int32_t*) octx->positions->data;
    float * mask = (float*) octx->attn_mask->data;
    uint32_t max_pos = octx->current_position + batch.n_tokens;
    for (int i = 0; i < (int) batch.n_tokens; i++) {
        pos[i] = (int32_t) octx->current_position + i;
        for (int ii = 0; ii < (int) max_pos; ii++) {
            mask[i*max_pos + ii] = ii > pos[i] ? -INFINITY : 0.0f;
        }
    }
}

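// Builds the prompt for a single sentence. The resulting token layout is:
// [prepended special tokens] [text tokens for "voice: sentence"] [appended special tokens],
// mirroring the inline prompt construction in the reference Orpheus python code.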
orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) {
    struct orpheus_ubatch batch;
    for (auto t : orpheus_prepended_tokens) {
        batch.tokens.push_back(t);
    }
    if (!octx->voice.empty()) {
        sentence = octx->voice + ": " + sentence;
    }
    tokenizer->tokenize(sentence, batch.tokens);
    for (auto t : orpheus_appended_tokens) {
        batch.tokens.push_back(t);
    }
    batch.n_tokens = batch.tokens.size();
    return batch;
}

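// The sampled tokens form flat frames of 7 audio codes. Each position within a frame
// belongs to a fixed audio head (model->heads), so this demuxes the stream into one
// token vector per head for the SNAC audio decoder run by srunner.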
std::vector<std::vector<uint32_t>> orpheus_runner::prepare_output_tokens() {
    size_t chunks = octx->output_tokens.size() / 7;
    std::vector<std::vector<uint32_t>> output_tokens;
    for (int i = 0; i < model->audio_heads; i++) {
        output_tokens.push_back(std::vector<uint32_t>{});
    }
    for (int i = 0; i < (int) chunks; i++) {
        for (int ii = 0; ii < 7; ii++) {
            uint32_t thead = model->heads[ii];
            // The manipulations below are not configurable because they are performed inline via
            // undocumented constants in the Orpheus codebase. Essentially this is how Orpheus
            // converts discrete samples from the output shape to the audio input shape.
            uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096);
            output_tokens[thead].push_back(t);
        }
    }
    return output_tokens;
}

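// The autoregressive generation loop: decode the current batch, sample one token from
// the resulting logits, then feed that token back as a single-token batch until the
// stopping token is produced or max_generation_size is reached.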
void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) {
    while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) {
        decode(batch);
        generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens);
        // only increment the output count after sampling
        octx->n_outputs++;
        batch = orpheus_ubatch{
            1, {octx->output_tokens.back()}
        };
    }
    // this case could be better addressed by adding splitting to the generation process.
    if (octx->output_tokens.size() >= model->max_generation_size) {
        fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n");
    }
    std::vector<std::vector<uint32_t>> processed_output_tokens = prepare_output_tokens();
    srunner->run(processed_output_tokens, output);
}

int orpheus_runner::generate(std::string sentence, struct tts_response * response) {
    orpheus_ubatch batch = batch_from_sentence(sentence);
    // it should be possible to update the max context window size, but currently it is extremely
    // unlikely that a single prompt will surpass the default size.
    if (batch.tokens.size() > model->max_context_length) {
        TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortening the prompt.");
    }
    octx->reset();
    generation_sampler->reset();
    if (!kv_self) {
        orpheus_kv_cache_init();
    }
    generate_from_batch(batch, response);
    return 0;
}

void orpheus_runner::configure_generation(generation_configuration * config) {
    generation_sampler->temperature = config->temperature;
    generation_sampler->repetition_penalty = config->repetition_penalty;
    generation_sampler->do_sample = config->sample;
    generation_sampler->top_k = config->top_k;
    generation_sampler->top_p = config->top_p;
    if (!config->voice.empty() && std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end()) {
        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
    }
    octx->voice = config->voice;
}

orpheus_ubatch orpheus_runner::build_worst_case_batch() {
    orpheus_ubatch batch;
    batch.n_tokens = model->max_context_length;
    return batch;
}

void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) {
    if (tensor->data == NULL) {
        return;
    }

    if (name.size() == 0) {
        // handles the top level meta tensor
        return;
    }

    if (name.size() > 5 && name.substr(0, 5) == "snac.") {
        srunner->model->assign_weight(name.substr(5), tensor);
    } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") {
        model->assign_weight(name.substr(8), tensor);
    } else {
        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str());
    }
}

void orpheus_runner::prepare_post_load() {
    srunner->prepare_post_load();
    orpheus_kv_cache_init();
    auto batch = build_worst_case_batch();
    auto gf = build_orpheus_graph(batch);
    octx->prep_schedule(gf);
}

std::vector<std::string> list_voices() {
    std::vector<std::string> voices;
    voices.reserve(orpheus_voices.size());
    for (auto voice : orpheus_voices) {
        voices.push_back(voice);
    }
    return voices;
}