From 2c0239fcf2c5b237d3fd4c530cfd50a2568f6775 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 20 Jan 2025 23:02:50 +0800
Subject: [PATCH] exploration of alternative wavtokenizer

---
 examples/tts/convert_pt_to_hf.py | 2 +-
 otherarch/tts_adapter.cpp        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index 8909a65fd..f9c9e77ba 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -154,7 +154,7 @@ config = {
     "architectures": [
         "WavTokenizerDec"
     ],
-    "hidden_size": 1282,
+    "hidden_size": 1282, # or 2402 for 40t/s
     "n_embd_features": 512,
     "n_ff": 2304,
     "vocab_size": 4096,
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
index 75c4b103a..1b1cd72d4 100644
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -152,9 +152,9 @@ static std::vector<float> embd_to_audio(
         const int n_codes,
         const int n_embd,
         const int n_thread) {
-    const int n_fft = 1280;
-    const int n_hop = 320;
-    const int n_win = 1280;
+    const int n_hop = 600;
+    const int n_fft = n_hop*4; // n_fft is 1280 when n_hop is 320, or 2400 when n_hop is 600
+    const int n_win = n_hop*4;
     const int n_pad = (n_win - n_hop)/2;
     const int n_out = (n_codes - 1)*n_hop + n_win;