standardize tts linting and formatting

This commit is contained in:
Concedo 2025-08-17 14:11:30 +08:00
parent cfc1a0d4ef
commit 9935ac093f
24 changed files with 371 additions and 355 deletions

View file

@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(tts_adapter
otherarch/tts_adapter.cpp)
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common)
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

View file

@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ggml/src/ggml-vulkan-shaders.cpp:

View file

@ -25,6 +25,22 @@
#define M_PI 3.14159265358979323846
#endif
//imports required for tts.cpp to work
#include "tts.cpp"
#include "ttstokenizer.cpp"
#include "ttssampler.cpp"
#include "parler_model.cpp"
#include "dac_model.cpp"
#include "ttsutil.cpp"
#include "ttst5_encoder_model.cpp"
#include "phonemizer.cpp"
#include "tts_model.cpp"
#include "kokoro_model.cpp"
#include "dia_model.cpp"
#include "orpheus_model.cpp"
#include "snac_model.cpp"
#include "general_neural_audio_codec.cpp"
enum TTS_VER
{
TTS_VER_2,

View file

@ -12,7 +12,7 @@
#include <unordered_map>
#include <map>
#include <unordered_set>
#include "tokenizer.h"
#include "ttstokenizer.h"
#include <algorithm>
#include <mutex>
@ -323,7 +323,7 @@ public:
#endif
enum lookup_code {
SUCCESS = 100,
SUCCESS_TOTAL = 100,
SUCCESS_PARTIAL = 101,
FAILURE_UNFOUND = 200,
FAILURE_PHONETIC = 201,

View file

@ -1,7 +1,7 @@
#pragma once
#include "dac_model.h"
#include "sampler.h"
#include "ttssampler.h"
struct dia_encoder_layer {
struct ggml_tensor * k;

View file

@ -3,7 +3,7 @@
#include <stdlib.h>
#include "tts_model.h"
#include "tokenizer.h"
#include "ttstokenizer.h"
#include "phonemizer.h"
// Rather than using ISO 639-2 language codes, Kokoro voice packs specify their corresponding language via their first letter.

View file

@ -1,7 +1,7 @@
#pragma once
#include "sampler.h"
#include "tokenizer.h"
#include "ttssampler.h"
#include "ttstokenizer.h"
#include "snac_model.h"
// Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.

View file

@ -2,8 +2,8 @@
#define parler_model_h
#include "dac_model.h"
#include "t5_encoder_model.h"
#include "sampler.h"
#include "ttst5_encoder_model.h"
#include "ttssampler.h"
enum parler_tensor {
PARLER_EMBD,

View file

@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu
}
std::vector<dictionary_response*> possibilities = lookup_map.at(value);
for (auto possible : possibilities) {
if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
return possible;
}
}
@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
output->append(" ");
}
flags->update_for_word(word);
if (response->code != SUCCESS) {
if (response->code != SUCCESS_TOTAL) {
word += response->after_match;
output->append(response->value);
text->size_pop(word.size()+unaccented_size_difference);
@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) {
bool not_at_start = key[0] == '#';
bool not_at_end = key.back() == '#';
if (!has_spacing) {
dictionary_response * resp = new dictionary_response(SUCCESS, value);
dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value);
resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
resp->not_at_clause_end = not_at_end;
resp->not_at_clause_start = not_at_start;

View file

@ -1,4 +1,4 @@
#include "sampler.h"
#include "ttssampler.h"
void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
// assume that we are pointing to the start of the first token output;

View file

@ -1,4 +1,4 @@
#include "t5_encoder_model.h"
#include "ttst5_encoder_model.h"
static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
{"t5encoder.token_embd", T5_EMBD},

View file

@ -2,7 +2,7 @@
#define t5_encoder_model_h
#include "tts_model.h"
#include "tokenizer.h"
#include "ttstokenizer.h"
enum t5_tensor {

View file

@ -1,4 +1,4 @@
#include "tokenizer.h"
#include "ttstokenizer.h"
void token_trie::add(const std::string & gram, uint32_t token) {
_add(gram, token, 0);