From 2c0239fcf2c5b237d3fd4c530cfd50a2568f6775 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:02:50 +0800 Subject: [PATCH] exploration of alternative wavtokenizer --- examples/tts/convert_pt_to_hf.py | 2 +- otherarch/tts_adapter.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 8909a65fd..f9c9e77ba 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -154,7 +154,7 @@ config = { "architectures": [ "WavTokenizerDec" ], - "hidden_size": 1282, + "hidden_size": 1282, # or 2402 for 40t/s "n_embd_features": 512, "n_ff": 2304, "vocab_size": 4096, diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 75c4b103a..1b1cd72d4 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -152,9 +152,9 @@ static std::vector embd_to_audio( const int n_codes, const int n_embd, const int n_thread) { - const int n_fft = 1280; - const int n_hop = 320; - const int n_win = 1280; + const int n_hop = 600; + const int n_fft = n_hop*4; //its 1280 at 320, or 2400 at 600 + const int n_win = n_hop*4; const int n_pad = (n_win - n_hop)/2; const int n_out = (n_codes - 1)*n_hop + n_win;