Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
standardize tts linting and formatting

parent cfc1a0d4ef
commit 9935ac093f

24 changed files with 371 additions and 355 deletions
@@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(tts_adapter
 otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common)
 target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
Makefile
@@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ggml/src/ggml-vulkan-shaders.cpp:
@@ -25,6 +25,22 @@
 #define M_PI 3.14159265358979323846
 #endif

+//imports required for tts.cpp to work
+#include "tts.cpp"
+#include "ttstokenizer.cpp"
+#include "ttssampler.cpp"
+#include "parler_model.cpp"
+#include "dac_model.cpp"
+#include "ttsutil.cpp"
+#include "ttst5_encoder_model.cpp"
+#include "phonemizer.cpp"
+#include "tts_model.cpp"
+#include "kokoro_model.cpp"
+#include "dia_model.cpp"
+#include "orpheus_model.cpp"
+#include "snac_model.cpp"
+#include "general_neural_audio_codec.cpp"
+
 enum TTS_VER
 {
 TTS_VER_2,
@@ -9,8 +9,8 @@ float energy(float * chunk, int count) {
 }

 void apply_energy_voice_inactivity_detection(
-tts_response & data,
-float sample_rate,
+tts_response & data,
+float sample_rate,
 int ms_per_frame,
 int frame_threshold,
 float normalized_energy_threshold,
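Aside: the hunk above only reflows the parameters of apply_energy_voice_inactivity_detection. For readers unfamiliar with energy-based VAD, a minimal sketch of the per-frame energy measure this interface implies is below; the mean-squared-amplitude choice is an assumption, not necessarily what the energy() function in this file computes.

    #include <cstddef>

    // Mean squared amplitude of one frame of samples (illustrative sketch).
    float frame_energy(const float * chunk, int count) {
        float acc = 0.0f;
        for (int i = 0; i < count; i++) {
            acc += chunk[i] * chunk[i]; // squared amplitude per sample
        }
        return count > 0 ? acc / count : 0.0f;
    }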
(File diff suppressed because it is too large.)
@@ -12,7 +12,7 @@
 #include <unordered_map>
 #include <map>
 #include <unordered_set>
-#include "tokenizer.h"
+#include "ttstokenizer.h"
 #include <algorithm>
 #include <mutex>

@@ -33,16 +33,16 @@ static const std::unordered_set<std::string> ONE_LETTER_WORDS = {
 "i",
 };
 /*
- * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
+ * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
 * via several criteria:
 * 1. All non-EN-US words have been removed
 * 2. All three letter acronyms have been removed (as these lists are used to identify acronyms)
-* 3. All archaic, deprecated, or poetic words have been removed.
-* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
-* last 10 years).
-*
-* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
-* vernacular but was not identified as of American origin was reintroduced into the sets below.
+* 3. All archaic, deprecated, or poetic words have been removed.
+* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
+* last 10 years).
+*
+* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
+* vernacular but was not identified as of American origin was reintroduced into the sets below.
 */
 static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
 "ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br",
@@ -50,7 +50,7 @@ static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
 "id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na",
 "no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi",
 "re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya",
-"ye", "yo",
+"ye", "yo",
 };
 static const std::unordered_set<std::string> THREE_LETTER_WORDS = {
 "aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age",
@@ -292,7 +292,7 @@ static std::string STOPPING_TOKENS = ".,:;!?";

 #ifdef ESPEAK_INSTALL
 /**
-* espeak-ng uses globals to persist and manage its state so it is not compatible with
+* espeak-ng uses globals to persist and manage its state so it is not compatible with
 * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527).
 * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such
 * that multiple instances of the kokoro_runner can be initialized and called in parallel.
@@ -323,7 +323,7 @@ public:
 #endif

 enum lookup_code {
-SUCCESS = 100,
+SUCCESS_TOTAL = 100,
 SUCCESS_PARTIAL = 101,
 FAILURE_UNFOUND = 200,
 FAILURE_PHONETIC = 201,
@@ -368,7 +368,7 @@ struct conditions {
 void update_for_word(std::string word,bool allow_for_upper_check = true);
 };

-/*
+/*
 * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text
 * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion
 * in order to accurately phonemize complicated text.
@@ -376,7 +376,7 @@ struct conditions {
 struct corpus {
 corpus(const char * text, size_t size): size(size), text(text) {};
 size_t location = 0;
-size_t size;
+size_t size;
 const char * text;

 /*
@@ -397,9 +397,9 @@ struct corpus {
 std::string after_until(int after, std::string val);
 };

-/*
+/*
 * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came
-* before, after, and for any word specific exceptions in order to compile a
+* before, after, and for any word specific exceptions in order to compile a
 */
 struct phonemizer_rule {
 ~phonemizer_rule() {
@@ -436,10 +436,10 @@ private:

 struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta);

-/*
+/*
 * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup.
 * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned,
-* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
+* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
 * token representation of a different word (e.g. with numbers).
 *
 * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors
@@ -470,7 +470,7 @@ struct phoneme_dictionary {

 struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);

-/*
+/*
 * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries,
 * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these
 * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support
@@ -478,8 +478,8 @@ struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
 * espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box,
 * while also optionally acting as an interface for espeak phonemization.
 *
-* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
-* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
+* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
+* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
 * effecively like a simple router lexer. It will only support utf-8 encoded text and english IPA conversion.
 */
 struct phonemizer {
@@ -28,7 +28,7 @@ const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
 { "orpheus", ORPHEUS_ARCH }
 };

-/// Given a map from keys to values, creates a new map from values to keys
+/// Given a map from keys to values, creates a new map from values to keys
 template<typename K, typename V>
 static std::map<V, K> reverse_map(const std::map<K, V>& m) {
 std::map<V, K> r;
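The reverse_map helper above is what derives ARCHITECTURE_NAMES from SUPPORTED_ARCHITECTURES. A self-contained sketch of its behavior follows; the map contents in main() are illustrative, not the repo's actual tts_arch values.

    #include <cstdio>
    #include <map>
    #include <string>

    // Mirrors the template in the hunk above: invert a key->value map.
    template<typename K, typename V>
    static std::map<V, K> reverse_map(const std::map<K, V> & m) {
        std::map<V, K> r;
        for (const auto & kv : m) {
            r[kv.second] = kv.first; // if a value repeats, the later key wins
        }
        return r;
    }

    int main() {
        std::map<std::string, int> arch = { {"parler", 0}, {"dia", 1}, {"orpheus", 2} };
        std::map<int, std::string> names = reverse_map(arch);
        printf("%s\n", names.at(1).c_str()); // prints "dia"
    }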
@@ -43,10 +43,10 @@ const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED
 struct generation_configuration {
 generation_configuration(
 std::string voice = "",
-int top_k = 50,
-float temperature = 1.0,
-float repetition_penalty = 1.0,
-bool use_cross_attn = true,
+int top_k = 50,
+float temperature = 1.0,
+float repetition_penalty = 1.0,
+bool use_cross_attn = true,
 std::string espeak_voice_id = "",
 int max_tokens = 0,
 float top_p = 1.0,
@@ -22,13 +22,13 @@ struct dac_quantize_layer {
 // this struct maintains the static tensors for the dac audio decoder graph.
 // As such, this is designed to contain basic configuration and ggml tensor support for DAC.
 // The dac_runner describes how the graph is built and run.
-struct dac_model : tts_model {
+struct dac_model : tts_model {
 // These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder
 uint32_t n_layers = 4;
 uint32_t n_heads = 9;
 uint32_t up_sampling_factor = 512;
 uint32_t max_generation_size = 2580;

 struct ggml_tensor * in_conv_kernel;
 struct ggml_tensor * in_conv_bias;
 struct ggml_tensor * out_conv_kernel;
@@ -53,11 +53,11 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor *
 // the context used for running the dac model
 struct dac_context : runner_context {
 dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};

 struct dac_model * model;

 struct ggml_tensor * inp_tokens;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -85,11 +85,11 @@ struct dac_runner : tts_runner {
 }
 dac_model * model;
 dac_context * dctx;

 void init_build() {
 tts_runner::init_build(&dctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
 void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
@@ -119,7 +119,7 @@ void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * la
 set_tensor(layer->self_attn_norm, tensor);
 } else if (part == "pre_mlp_norm") {
 layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
-set_tensor(layer->mlp_norm, tensor);
+set_tensor(layer->mlp_norm, tensor);
 } else if (part == "pre_ca_norm") {
 layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
 set_tensor(layer->cross_attn_norm, tensor);
@@ -151,7 +151,7 @@ void dia_model::prep_layers() {
 dia_decoder_layer * l = new dia_decoder_layer;
 decoder->layers.push_back(l);
 }

 decoder->embds.reserve((size_t) n_output_heads);
 decoder->heads.reserve((size_t) n_output_heads);
 for (int i = 0; i < n_output_heads; i++) {
@@ -196,7 +196,7 @@ void dia_model::prep_constants(gguf_context * meta) {
 int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
 if (encoder_attn_heads_key != -1) {
 encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
 }
 }

 int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
 if (head_size_key != -1) {
@@ -271,7 +271,7 @@ struct dia_context * build_new_dia_context(struct dia_model * model, int n_threa
 return dctx;
 }

-static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
+static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
 ggml_backend_buffer_type_t buft = nullptr;
 // this will only really support cpu or metal for the time being;
 if (dctx->backend != nullptr) {
@@ -382,7 +382,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
 struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
 for (auto layer : model->encoder->layers) {
 struct ggml_tensor * residual = cur;

 cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
 // self-attention
 {
@@ -402,7 +402,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
 struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);

 // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
-// then down project back the the encoder embedding dimension.
+// then down project back the the encoder embedding dimension.
 cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
 cur = ggml_mul_mat(ctx, layer->o, cur);
 }
@@ -443,10 +443,10 @@ static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct gg
 static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
 int64_t attn_size = model->head_size * model->decoder_attn_heads;

-struct ggml_tensor * k_cache_view =
+struct ggml_tensor * k_cache_view =
 ggml_view_2d(
-ctx, kv->k_l[layer_index], attn_size, 2,
-attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
+ctx, kv->k_l[layer_index], attn_size, 2,
+attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
 attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));

 k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@@ -461,8 +461,8 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
 struct ggml_tensor * v_cache_view = nullptr;

 v_cache_view = ggml_view_2d(
-ctx, kv->v_l[layer_index], attn_size, 2,
-attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
+ctx, kv->v_l[layer_index], attn_size, 2,
+attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
 attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));

 // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
@@ -476,11 +476,11 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
 static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
 dia_decoder_layer * layer = model->decoder->layers[layer_index];
 struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
-ctx,
-encoder_hidden_states,
-model->encoder_hidden_size,
-dctx->prompt_size,
-2,
+ctx,
+encoder_hidden_states,
+model->encoder_hidden_size,
+dctx->prompt_size,
+2,
 model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));

 struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
@@ -491,8 +491,8 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 struct ggml_tensor * k_cache_view =
 ggml_view_4d(
-ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
-model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
+ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
+model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
 model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
 model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
 0);
@@ -504,10 +504,10 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 struct ggml_tensor * v_cache_view =
 ggml_view_4d(
-ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
-model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
-model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
+ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
+model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
 0);

 ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
@@ -515,11 +515,11 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 static struct ggml_tensor * build_dia_decoder(
 ggml_cgraph * gf,
-ggml_context * ctx,
-dia_model * model,
-dia_context * dctx,
-dia_kv_cache * cache,
-dia_ubatch & batch,
+ggml_context * ctx,
+dia_model * model,
+dia_context * dctx,
+dia_kv_cache * cache,
+dia_ubatch & batch,
 struct ggml_tensor * encoder_hidden_states) {
 dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
 ggml_set_input(dctx->positions);
@@ -528,7 +528,7 @@ static struct ggml_tensor * build_dia_decoder(
 for (int l = 0; l < model->decoder->layers.size(); l++){
 dia_decoder_layer * layer = model->decoder->layers[l];
 struct ggml_tensor * residual = cur;

 cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
 // self-attention
 {
@@ -546,13 +546,13 @@ static struct ggml_tensor * build_dia_decoder(
 0);
 k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));

-struct ggml_tensor * v =
+struct ggml_tensor * v =
 ggml_view_3d(ctx, cache->v_l[l],
 model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
 ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
 ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
 0);
-v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);
+v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

 // As noted in the encoder Dia uses the Neo-X protocol for RoPE.
 Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@@ -583,22 +583,22 @@ static struct ggml_tensor * build_dia_decoder(
 build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
 }

-struct ggml_tensor * cross_k =
+struct ggml_tensor * cross_k =
 ggml_view_4d(
 ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
-model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
-model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
+model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
+model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
+model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
 0);
 // the double permute operation shouldn't be necessary here, but it seems that currently ggml permute only currently alows for a single
 // axis pair to be transposed.
 cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));

-struct ggml_tensor * cross_v =
+struct ggml_tensor * cross_v =
 ggml_cont(ctx, ggml_view_4d(
 ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
-model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
 model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
 0));

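The cross-attention cache reads above lean heavily on ggml_view_4d, whose nb* arguments are byte strides rather than element counts. A small self-contained illustration follows; the dimensions are made up, only the stride arithmetic matters.

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { /*mem_size =*/ 16u * 1024 * 1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);
        // a flat buffer standing in for one layer's cross-attention key cache
        struct ggml_tensor * cache = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64 * 8 * 2 * 32);
        // reinterpret it as [head_size=64, heads=8, batch=2, positions=32]
        struct ggml_tensor * view = ggml_view_4d(ctx, cache, 64, 8, 2, 32,
            64 * ggml_element_size(cache),         // nb1: bytes between heads
            64 * 8 * ggml_element_size(cache),     // nb2: bytes between batch entries
            64 * 8 * 2 * ggml_element_size(cache), // nb3: bytes between positions
            0);                                    // byte offset into the cache
        (void) view;
        ggml_free(ctx);
        return 0;
    }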
@@ -637,10 +637,10 @@ static struct ggml_tensor * build_dia_decoder(
 }

 void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
-// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
-// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
+// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
+// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
 // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
-// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
+// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
 // max context size for both the conditional and unconditional sequence.

 // if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one.
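The comment block above describes Dia's byte-level tokenization and its classifier-free-guidance style correction. A hedged sketch of both ideas follows; the helper names and the cfg_scale parameter are illustrative, not the actual dia_runner API.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Each character's byte value becomes its token id; the sequence is padded
    // to the max context so conditional and unconditional passes line up.
    std::vector<uint32_t> byte_tokenize(const std::string & sentence, size_t max_context) {
        std::vector<uint32_t> tokens;
        for (unsigned char c : sentence) {
            tokens.push_back((uint32_t) c);
        }
        tokens.resize(max_context, 0); // pad with an assumed pad id of 0
        return tokens;
    }

    // Per the comment above: add the difference between the conditional and
    // unconditional outputs to the conditional output before sampling.
    void apply_cfg_adjustment(std::vector<float> & cond, const std::vector<float> & uncond, float cfg_scale) {
        for (size_t i = 0; i < cond.size(); i++) {
            cond[i] += cfg_scale * (cond[i] - uncond[i]);
        }
    }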
@@ -699,7 +699,7 @@ dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
 * 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
 * to the conditional ouput before sampling. This is why the batch is set to two throughout the graph.
 *
-* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
+* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
 * encoder sequence is always max length.
 */
 struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
@@ -716,7 +716,7 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
 ggml_set_name(cur, "decoder_output");
 ggml_build_forward_expand(gf, cur);
 free_build();

 return gf;
 }

@@ -758,11 +758,11 @@ int dia_runner::decode(dia_ubatch & batch) {
 dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
 }
 ggml_backend_sched_reset(dctx->sched);

 const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
 const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
 const size_t new_size = logits_size * sizeof(float);

 if (!dctx->buf_output || prev_size < new_size) {
 if (dctx->buf_output) {
 ggml_backend_buffer_free(dctx->buf_output);
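The buffer handling above follows a grow-only pattern: reuse the existing output buffer when it is already large enough, otherwise free it and allocate a bigger one. The same logic in plain C++ terms (a sketch, not the ggml-backend API):

    #include <cstdlib>

    void * grow_only(void * buf, size_t prev_size, size_t new_size) {
        if (buf != nullptr && prev_size >= new_size) {
            return buf;                // existing buffer is big enough: keep it
        }
        std::free(buf);                // otherwise release it...
        return std::malloc(new_size);  // ...and allocate the larger size
    }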
@@ -772,7 +772,7 @@ int dia_runner::decode(dia_ubatch & batch) {

 dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
 }

 dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

 ggml_cgraph * gf = build_dia_graph(batch);
@@ -817,7 +817,7 @@ bool dia_runner::check_stopping(dia_ubatch & batch) {
 if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
 dctx->delay_steps = model->max_delay;
 }

 if (dctx->delay_steps > 0) {
 int step_after_eos = model->max_delay - dctx->delay_steps;
 for (int i = 0; i < model->delay_pattern.size(); i++) {
@@ -907,5 +907,5 @@ void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
 dac_runner->model->assign_weight(name.substr(14), tensor);
 } else {
 model->assign_weight(name, tensor);
 }
 }
@@ -1,7 +1,7 @@
 #pragma once

 #include "dac_model.h"
-#include "sampler.h"
+#include "ttssampler.h"

 struct dia_encoder_layer {
 struct ggml_tensor * k;
@@ -22,7 +22,7 @@ struct dia_decoder_layer {
 struct ggml_tensor * self_attn_v;
 struct ggml_tensor * self_attn_o;
 struct ggml_tensor * self_attn_norm;

 struct ggml_tensor * cross_attn_k;
 struct ggml_tensor * cross_attn_q;
 struct ggml_tensor * cross_attn_v;
@@ -76,7 +76,7 @@ struct dia_model : tts_model {

 dia_encoder * encoder;
 dia_decoder * decoder;

 void assign_weight(std::string name, ggml_tensor * tensor);
 void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
 void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
@@ -103,15 +103,15 @@ struct dia_context : runner_context {
 uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.

 std::vector<uint32_t> output_tokens;
-struct dia_model * model;
+struct dia_model * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * audio_inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * encode_positions;
 struct ggml_tensor * encode_attn_mask;
 struct ggml_tensor * cross_attn_mask;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -126,11 +126,11 @@ struct dia_kv_cache {

 std::vector<struct ggml_tensor *> k_l;
 std::vector<struct ggml_tensor *> v_l;

 struct ggml_context * ctx;
 ggml_backend_buffer_type_t buft;
 ggml_backend_buffer_t buf;

 void free() {
 ggml_free(ctx);
 ggml_backend_buffer_free(buf);
@@ -53,7 +53,7 @@ namespace general_neural_audio_codec {

 uint32_t padding;
 uint32_t stride;

 std::vector<residual_unit> residual_blocks;
 };

@@ -3,11 +3,11 @@

 #include <stdlib.h>
 #include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"
 #include "phonemizer.h"

 // Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter.
-// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
+// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
 // appropriate phonemization protocol can inferred from the Kokoro voice.
 static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
 {'a', "gmw/en-US"},
@@ -22,7 +22,7 @@ static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
 };

 struct lstm_cell {
-std::vector<ggml_tensor*> weights;
+std::vector<ggml_tensor*> weights;
 std::vector<ggml_tensor*> biases;
 std::vector<ggml_tensor*> reverse_weights;
 std::vector<ggml_tensor*> reverse_biases;
@@ -197,8 +197,8 @@ struct kokoro_model : tts_model {
 // standard configuration for duration prediction
 uint32_t f0_n_blocks = 3;
 uint32_t n_duration_prediction_layers = 3;
-// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
-// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
+// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
+// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
 // allocation increases node allocation size by O(N)
 uint32_t max_duration_per_token = 20;
 uint32_t style_half_size = 128;
@@ -221,7 +221,7 @@ struct kokoro_model : tts_model {
 float noise_std = 0.003f;
 float voice_threshold = 10.0f;
 float sample_rate = 24000.0f;
-std::string window = "hann";
+std::string window = "hann";

 // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
 // This is just the constant defined above as a tensor.
@@ -259,7 +259,7 @@ struct kokoro_model : tts_model {
 // Decoding and Generation portion of the model
 struct kokoro_decoder * decoder;

-// the default hidden states need to be initialized
+// the default hidden states need to be initialized
 std::vector<lstm*> lstms;

 size_t duration_node_counter = 0;
@@ -317,15 +317,15 @@ struct kokoro_duration_context : runner_context {
 ~kokoro_duration_context() {
 ggml_backend_buffer_free(buf_len_output);
 }

 std::string voice = "af_alloy";
 struct kokoro_model * model;
 ggml_backend_buffer_t buf_len_output = nullptr;

 size_t logits_size = 0; // capacity (of floats) for logits
 float * lens = nullptr;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
@@ -356,7 +356,7 @@ struct kokoro_duration_response {
 };

 // This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
-// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
+// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
 kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
@@ -375,7 +375,7 @@ struct kokoro_duration_runner : tts_runner {
 void init_build() {
 tts_runner::init_build(&kctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct kokoro_ubatch build_worst_case_batch();
 void set_inputs(kokoro_ubatch & batch);
@@ -397,7 +397,7 @@ struct kokoro_context : runner_context {
 }

 std::string voice = "af_alloy";

 struct kokoro_model * model;

 uint32_t total_duration;
@@ -408,7 +408,7 @@ struct kokoro_context : runner_context {
 struct ggml_tensor * duration_mask;
 struct ggml_tensor * window_sq_sum; // needs to be calculatd from the generator window.
 struct ggml_tensor * uv_noise_data;

 void build_schedule() {
 runner_context::build_schedule(model->max_gen_nodes()*30);
 }
@@ -150,7 +150,7 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads
 return octx;
 }

-void orpheus_runner::orpheus_kv_cache_init() {
+void orpheus_runner::orpheus_kv_cache_init() {
 ggml_backend_buffer_type_t buft = nullptr;
 if (octx->backend != nullptr) {
 #ifdef GGML_USE_METAL
@@ -192,21 +192,21 @@ void orpheus_runner::orpheus_kv_cache_init() {
 }

 void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
-k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
+k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
 model->head_size, 2,0, 500000.0f,
 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

 // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
 // and performing the repeat operation upfront before performign a single copy needs to be performed in order to better optimize this function.
-// Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
+// Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
 // from incrementally larger transpositions with generation.
 for (int i = 0; i < repeat; i++) {
 struct ggml_tensor * k_cache_view = ggml_view_3d(
-ctx,
-kv_self->k_l[index],
+ctx,
+kv_self->k_l[index],
 model->head_size,
 model->n_kv_attn_heads,
-n_tokens,
+n_tokens,
 ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
 ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
 ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
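For context on the repeat_interleave the comment above debates: grouped-query attention stores fewer kv heads than query heads, so each kv head must be duplicated `repeat` times before a plain attention kernel can consume it. A plain-vector sketch of that duplication (the flat [head, element] layout is an assumption for illustration):

    #include <vector>

    std::vector<float> repeat_interleave_heads(const std::vector<float> & kv, int n_kv_heads, int head_size, int repeat) {
        std::vector<float> out;
        out.reserve(kv.size() * repeat);
        for (int h = 0; h < n_kv_heads; h++) {
            for (int r = 0; r < repeat; r++) {
                for (int e = 0; e < head_size; e++) {
                    out.push_back(kv[h * head_size + e]); // head h emitted repeat times, back to back
                }
            }
        }
        return out;
    }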
@@ -230,19 +230,19 @@ void orpheus_runner::orpheus_kv_cache_init() {
 struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
 init_build();
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;

 const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
 octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 ggml_set_input(octx->positions);
 octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 ggml_set_input(octx->inp_tokens);
 inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);

 struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

 for (int l = 0; l < model->n_layers; l++) {
 struct ggml_tensor * residual = inpL;
 cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);
@@ -261,8 +261,8 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 model->head_size, full_sequence_length, model->n_attn_heads,
 ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
 ggml_element_size(kv_self->k_l[l]) * model->head_size,
-0));
+0));

 struct ggml_tensor * v =
 ggml_view_2d(ctx, kv_self->v_l[l],
 model->hidden_size, full_sequence_length,
@@ -272,7 +272,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

 Qcur = ggml_rope_ext(
-ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
+ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
 octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

@@ -286,7 +286,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 }

 cur = ggml_add(ctx, attn_out, residual);

 struct ggml_tensor * residualffn = cur;

 // mlp
@@ -298,7 +298,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 cur = ggml_add(ctx, cur, residualffn);
 inpL = cur;
 }

 cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
 // only about 40k of the output head is actually uses for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented.
 cur = ggml_mul_mat(ctx, model->head, cur);
@@ -307,15 +307,15 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 }
 ggml_build_forward_expand(gf, cur);
 free_build();

 return gf;
 }

 void orpheus_runner::decode(orpheus_ubatch & batch) {
 ggml_backend_sched_reset(octx->sched);

 octx->output_tokens.reserve(model->max_generation_size);

 const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
 octx->prep_output_buffer(new_size);
@@ -324,10 +324,10 @@ void orpheus_runner::decode(orpheus_ubatch & batch) {
 // the output is always the last tensor in the graph
 struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 ggml_backend_sched_alloc_graph(octx->sched, gf);

 set_inputs(batch);
 ggml_backend_sched_graph_compute_async(octx->sched, gf);

 float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
 octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

@@ -1,7 +1,7 @@
 #pragma once

-#include "sampler.h"
-#include "tokenizer.h"
+#include "ttssampler.h"
+#include "ttstokenizer.h"
 #include "snac_model.h"

 // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
@@ -73,7 +73,7 @@ struct orpheus_context : runner_context {
 struct ggml_tensor * positions;
 };

-struct orpheus_kv_cache {
+struct orpheus_kv_cache {
 ggml_type cache_type = GGML_TYPE_F32;

 std::vector<struct ggml_tensor *> k_l;
@@ -104,11 +104,11 @@ struct orpheus_ubatch {

 struct orpheus_runner : tts_runner {
 orpheus_runner(
-orpheus_model * model,
-snac_runner * audio_decoder,
-orpheus_context * octx,
-bpe_tokenizer * bt,
-sampler * samp,
+orpheus_model * model,
+snac_runner * audio_decoder,
+orpheus_context * octx,
+bpe_tokenizer * bt,
+sampler * samp,
 orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
 tts_runner::sampling_rate = 24000.0f;
 generation_sampler->n_output_heads = 1;
@@ -2,8 +2,8 @@
 #define parler_model_h

 #include "dac_model.h"
-#include "t5_encoder_model.h"
-#include "sampler.h"
+#include "ttst5_encoder_model.h"
+#include "ttssampler.h"

 enum parler_tensor {
 PARLER_EMBD,
@@ -38,17 +38,17 @@ struct parler_layer {
 struct ggml_tensor * self_attn_o_proj;
 struct ggml_tensor * self_attn_norm;
 struct ggml_tensor * self_attn_norm_bias;

 struct ggml_tensor * attn_k_proj;
 struct ggml_tensor * attn_q_proj;
 struct ggml_tensor * attn_v_proj;
 struct ggml_tensor * attn_o_proj;
 struct ggml_tensor * attn_norm;
 struct ggml_tensor * attn_norm_bias;

 struct ggml_tensor * cross_k;
 struct ggml_tensor * cross_v;

 struct ggml_tensor * fc1;
 struct ggml_tensor * fc2;
 struct ggml_tensor * final_norm;
@@ -74,18 +74,18 @@ struct parler_tts_model : tts_model {
 uint32_t prompt_vocab_size;

 bool use_cross_attn = true;

 std::vector<struct ggml_tensor*> embds;
 std::vector<parler_layer*> layers;
 std::vector<struct ggml_tensor*> heads;

 struct ggml_tensor * precomputed_input_emb;
 struct ggml_tensor * precomputed_positional_embds;

 struct ggml_tensor * layer_norm;
 struct ggml_tensor * layer_norm_bias;
 struct ggml_tensor * prompt_embd;

 void assign_weight(std::string name, ggml_tensor * tensor);
 void prep_constants(gguf_context * meta);
 void prep_layers(gguf_context * meta);
@@ -107,21 +107,21 @@ struct parler_context : runner_context {
 std::vector<bool> eos_seen;

 bool use_cache = true;

 size_t output_size = 0; // capacity (of tokens positions) for the output buffers
 int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
 uint32_t current_position = 0; // current position in the active sequence
 uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
 int32_t seq_id; // a unique identifier associated with the active sequence.

 std::vector<uint32_t> output_tokens;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * audio_inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
 struct ggml_tensor * attn_mask_cross;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -130,17 +130,17 @@ struct parler_context : runner_context {

 struct parler_kv_cache {
 int32_t seq_id;

 ggml_type type_k = GGML_TYPE_F32;
 ggml_type type_v = GGML_TYPE_F32;

 std::vector<struct ggml_tensor *> k_l;
 std::vector<struct ggml_tensor *> v_l;

 struct ggml_context * ctx;
 ggml_backend_buffer_type_t buft;
 ggml_backend_buffer_t buf;

 void free() {
 ggml_free(ctx);
 ggml_backend_buffer_free(buf);
@@ -152,8 +152,8 @@ struct parler_kv_cache {
 };

 struct parler_ubatch {
-parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
-uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
+parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
+uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
 int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {};
 parler_ubatch() {};
 bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens
@@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu
 }
 std::vector<dictionary_response*> possibilities = lookup_map.at(value);
 for (auto possible : possibilities) {
-if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
+if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
 return possible;
 }
 }
@@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
 output->append(" ");
 }
 flags->update_for_word(word);
-if (response->code != SUCCESS) {
+if (response->code != SUCCESS_TOTAL) {
 word += response->after_match;
 output->append(response->value);
 text->size_pop(word.size()+unaccented_size_difference);
@@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) {
 bool not_at_start = key[0] == '#';
 bool not_at_end = key.back() == '#';
 if (!has_spacing) {
-dictionary_response * resp = new dictionary_response(SUCCESS, value);
+dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value);
 resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
 resp->not_at_clause_end = not_at_end;
 resp->not_at_clause_start = not_at_start;
@@ -4,7 +4,7 @@

 // SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC.
 // The key differences are that it uses grouping in the residual units of its layers,
-// performs a repeat_interleave over the second and third input channels, applies
+// performs a repeat_interleave over the second and third input channels, applies
 // a noise convolutional layer after input encoding for each layer, and applies
 // an extra convolutional layer before residual layers are applied.
 struct snac_model : tts_model {
@@ -19,7 +19,7 @@ struct snac_model : tts_model {
 uint32_t noise_steps[4] = {8, 64, 256, 512};
 uint32_t noise_steps_sum = 840;
 bool use_noise = true;

 struct ggml_tensor * repeat_interleave_buffer;

 struct ggml_tensor * in_conv_kernel;
@@ -46,12 +46,12 @@ struct snac_model : tts_model {
 // the context used for running the snac model
 struct snac_context : runner_context {
 snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {};

 struct snac_model * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * noise;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -74,11 +74,11 @@ struct snac_runner : tts_runner {
 }
 snac_model * model;
 snac_context * sctx;

 void init_build() {
 tts_runner::init_build(&sctx->buf_compute_meta);
 }

 void set_inputs(std::vector<std::vector<uint32_t>> & tokens);
 void prepare_post_load();
 struct ggml_cgraph * build_snac_graph(size_t sequence_length);
@@ -1,4 +1,4 @@
-#include "sampler.h"
+#include "ttssampler.h"

 void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
 // assume that we are pointing to the start of the first token output;
@@ -6,7 +6,7 @@ void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
 return max(logits, output_tokens);
 }
 std::vector<uint32_t> max_vals;
-// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
+// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
 // equal to top_p;
 std::vector<float> max_head_probs;

@@ -189,7 +189,7 @@ void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
 uint32_t token_id = 0;
 for (uint32_t ii = 0; ii < vocab_size; ii++) {
 float v = *(logits+i*vocab_size+ii);
-// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
+// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
 // the softmax function in which case it is possible for repetition counts to be set.
 if (has_repetition_penalty && last_token_ids[i] == ii) {
 v /= (pow(repetition_penalty, repetition_counts[i]));
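A quick numeric check of the penalty division above: with repetition_penalty = 1.1 and a positive logit of 4.0, successive repetition counts scale the logit geometrically (values in the sketch below are rounded to three decimals).

    #include <cmath>
    #include <cstdio>

    int main() {
        float logit = 4.0f;
        float repetition_penalty = 1.1f;
        for (int count = 0; count <= 3; count++) {
            // logit / penalty^count, as in the hunk above
            printf("count=%d -> %.3f\n", count, logit / powf(repetition_penalty, (float) count));
        }
        // count=0 -> 4.000, count=1 -> 3.636, count=2 -> 3.306, count=3 -> 3.005
        return 0;
    }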
@@ -21,7 +21,7 @@ struct sampler {
 std::vector<uint32_t> repetition_counts;
 bool do_sample = true;
 bool apply_softmax = true;

 void sample(float * logits, std::vector<uint32_t> & output_tokens);
 void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices);
 void max(float * logits, std::vector<uint32_t> & output_tokens);
@@ -1,4 +1,4 @@
-#include "t5_encoder_model.h"
+#include "ttst5_encoder_model.h"

 static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
 {"t5encoder.token_embd", T5_EMBD},
@@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) {
 int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
 if (bos_token_id_key != -1) {
 bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
 }
 }

 int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
 if (eos_token_id_key != -1) {
@@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;

 //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 //ggml_set_input(t5ctx->positions);

@@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

 struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
 struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);

 for (int l = 0; l < model->n_layers; l++) {
 struct ggml_tensor * residual = inpL;

@@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
 ggml_build_forward_expand(gf, cur);

 free_build();

 return gf;
 }

@@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) {
 for (int ii = 0; ii < batch.n_tokens; ii++) {
 int ab_rpos = abs(i - ii);
 int rpos = i - ii;
-attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
+attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
 pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
 }
 }
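The pos_bucket expression above is the standard T5 relative-position bucketing: small distances get their own bucket, larger distances share logarithmically spaced buckets, and positive offsets occupy a second bank of buckets. Pulled out into a standalone sketch below; the defaults (n_buckets = 32, max_exact = 16, max_distance = 128) are the usual T5 values and are assumptions here, not read from this model, and floating-point division is used where the original has integer operands.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    uint32_t t5_pos_bucket(int i, int ii, int n_buckets = 32, int max_exact = 16, int max_distance = 128) {
        int rpos = i - ii;
        int ab_rpos = std::abs(rpos);
        double log_denom = std::log((double) max_distance / max_exact);
        // logarithmic bucket for distances at or beyond max_exact
        int log_bucket = max_exact + (int) (std::log((double) ab_rpos / max_exact) / log_denom * max_exact);
        return (uint32_t) ((rpos > 0 ? n_buckets : 0) +
                           (ab_rpos < max_exact ? ab_rpos : std::min(n_buckets - 1, log_bucket)));
    }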
@@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
 batch.input_tokens = input_tokens;
 batch.n_tokens = sequence_length;
 ggml_backend_sched_reset(t5ctx->sched);

 const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
 const size_t new_size = model->max_context_length * model->output_size * sizeof(float);

 if (!t5ctx->buf_output || prev_size < new_size) {
 if (t5ctx->buf_output) {
 ggml_backend_buffer_free(t5ctx->buf_output);
@@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt

 t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
 }

 outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
 ggml_backend_buffer_clear(t5ctx->buf_output, 0);
 struct ggml_cgraph * gf = NULL;
@@ -2,7 +2,7 @@
 #define t5_encoder_model_h

 #include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"

 enum t5_tensor {
@@ -75,14 +75,14 @@ void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name,

 struct t5_context : runner_context {
 t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};

 struct t5_encoder * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
 struct ggml_tensor * inp_pos_bucket;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -116,7 +116,7 @@ struct t5_runner : tts_runner {
 void init_build() {
 tts_runner::init_build(&t5ctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct t5_ubatch build_worst_case_batch();
 void set_inputs(t5_ubatch & batch);
@@ -1,4 +1,4 @@
-#include "tokenizer.h"
+#include "ttstokenizer.h"

 void token_trie::add(const std::string & gram, uint32_t token) {
 _add(gram, token, 0);