diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e15b8cb5..1115088cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -463,6 +463,8 @@ add_library(common2 src/unicode-data.cpp otherarch/utils.cpp otherarch/utils.h + otherarch/llmutils.cpp + otherarch/llmutils.h common/reasoning-budget.cpp common/reasoning-budget.h tools/mtmd/mtmd-audio.cpp diff --git a/Makefile b/Makefile index 24b43e945..73b11a1ae 100644 --- a/Makefile +++ b/Makefile @@ -110,10 +110,10 @@ endif CUBLASLD_FLAGS = CUBLAS_OBJS = -OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o kcpp-repackmapper.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o kcpp-repackmapper_noavx2.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o kcpp-repackmapper_noavx1.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o kcpp-repackmapper_failsafe.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o +OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o kcpp-repackmapper.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_SIMPLE += 
ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o kcpp-repackmapper_noavx2.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o kcpp-repackmapper_noavx1.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o kcpp-repackmapper_failsafe.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o # OS specific ifeq ($(UNAME_S),Linux) @@ -602,6 +602,8 @@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h $(CXX) $(CXXFLAGS) -c $< -o $@ kcpputils.o: otherarch/utils.cpp otherarch/utils.h $(CXX) $(CXXFLAGS) -c $< -o $@ +kcppllmutils.o: otherarch/llmutils.cpp otherarch/llmutils.h + $(CXX) $(CXXFLAGS) -c $< -o $@ mtmdaudio.o: tools/mtmd/mtmd-audio.cpp tools/mtmd/mtmd-audio.h $(CXX) $(CXXFLAGS) -c $< -o $@ ggml-backend.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 9b9f2365c..d2473f082 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -24,6 +24,7 @@ #include #include "utils.h" +#include "llmutils.h" //for easier compilation //concat source files into one file for compilation purposes diff --git a/otherarch/llmutils.cpp b/otherarch/llmutils.cpp new file mode 100644 index 000000000..46835e985 --- /dev/null +++ b/otherarch/llmutils.cpp @@ -0,0 +1,164 @@ + +#include "llmutils.h" + +void 
kcpp_embd_batch::init_kcpp_batch(int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny) { + const int n_pos_per_embd = use_mrope ? 4 : 1; + const llama_seq_id seq_id = 0; + + if (use_mrope && mrope_is_image) { + GGML_ASSERT(img_nx > 0 && img_ny > 0); + GGML_ASSERT(img_nx * img_ny == n_tokens); + } + + pos.resize(n_tokens * n_pos_per_embd); + std::fill(pos.begin(), pos.end(), 0); + + n_seq_id.resize(n_tokens); + seq_ids.resize(n_tokens + 1); + logits.resize(n_tokens); + seq_id_0.resize(1); + + seq_id_0[0] = seq_id; + seq_ids[n_tokens] = nullptr; + + batch.pos = pos.data(); + batch.n_seq_id = n_seq_id.data(); + batch.seq_id = seq_ids.data(); + batch.logits = logits.data(); + + for (int i = 0; i < n_tokens; ++i) { + n_seq_id[i] = 1; + seq_ids[i] = seq_id_0.data(); + logits[i] = return_all_logits; + } + + // ---- position encoding ---- + if (!use_mrope) { + for (int i = 0; i < n_tokens; ++i) { + pos[i] = npast + i; + } + } else if (!mrope_is_image) { + // 1D M-RoPE (audio / embedding stream) + for (int i = 0; i < n_tokens; ++i) { + pos[i + 0 * n_tokens] = npast + i; + pos[i + 1 * n_tokens] = npast + i; + pos[i + 2 * n_tokens] = npast + i; + pos[i + 3 * n_tokens] = 0; + } + } else { + // 2D image M-RoPE + int idx = 0; + for (int y = 0; y < img_ny; ++y) { + for (int x = 0; x < img_nx; ++x) { + pos[idx + 0 * n_tokens] = npast; + pos[idx + 1 * n_tokens] = npast + y; + pos[idx + 2 * n_tokens] = npast + x; + pos[idx + 3 * n_tokens] = 0; + ++idx; + } + } + } + + // Always request logits for last token + logits[n_tokens - 1] = true; +} + +//for embeddings +kcpp_embd_batch::kcpp_embd_batch(float * embd, + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool mrope_is_image, + int img_nx, + int img_ny) { + batch = { + /* n_tokens = */ n_tokens, + /* tokens = */ nullptr, + /* embd = */ embd, + /* pos = */ nullptr, + /* n_seq_id = */ nullptr, + /* seq_id = */ nullptr, + /* logits = */ nullptr, + }; + 
+ init_kcpp_batch(n_tokens, npast, use_mrope, + /*return_all_logits=*/false, mrope_is_image, img_nx, img_ny); +} + +// for tokens +kcpp_embd_batch::kcpp_embd_batch(std::vector & tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny) { + batch = { + /* n_tokens = */ (int32_t) tokens.size(), + /* tokens = */ tokens.data(), + /* embd = */ nullptr, + /* pos = */ nullptr, + /* n_seq_id = */ nullptr, + /* seq_id = */ nullptr, + /* logits = */ nullptr, + }; + + init_kcpp_batch(batch.n_tokens, npast, use_mrope, return_all_logits, mrope_is_image, img_nx, img_ny); +} + +llama_batch kcpp_embd_batch::get_view(int offset, int n_tokens, int n_embd_mmproj) { + GGML_ASSERT(offset >= 0); + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(offset + n_tokens <= batch.n_tokens); + + const int total_tokens = batch.n_tokens; + llama_pos * pos_ptr = nullptr; + + // Detect M-RoPE vs normal RoPE + const bool is_mrope = (pos.size() > (size_t)total_tokens); + + pos_view.clear(); + + if (is_mrope) { + const int n_pos_per_embd = pos.size() / total_tokens; + GGML_ASSERT(n_pos_per_embd == 4); + + // Layout: + // src: [dim0_all_tokens][dim1_all_tokens][dim2_all_tokens][dim3_all_tokens] + // dst: same layout, but only [offset : offset + n_tokens] + pos_view.reserve(n_tokens * n_pos_per_embd); + + for (int dim = 0; dim < n_pos_per_embd; ++dim) { + const llama_pos * src = + pos.data() + dim * total_tokens + offset; + + pos_view.insert( + pos_view.end(), + src, + src + n_tokens + ); + } + + pos_ptr = pos_view.data(); + } + else { + // Normal RoPE: contiguous slice + pos_ptr = pos.data() + offset; + } + + return { + /* n_tokens = */ n_tokens, + /* tokens = */ nullptr, + /* embd = */ batch.embd ? 
batch.embd + offset*n_embd_mmproj : nullptr, + /* pos = */ pos_ptr, + /* n_seq_id = */ batch.n_seq_id + offset, + /* seq_id = */ batch.seq_id + offset, + /* logits = */ batch.logits + offset, + }; +} \ No newline at end of file diff --git a/otherarch/llmutils.h b/otherarch/llmutils.h new file mode 100644 index 000000000..e295362c0 --- /dev/null +++ b/otherarch/llmutils.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "llama.h" + +//duplicated and modified from llava_embd_batch +struct kcpp_embd_batch { + std::vector pos; + std::vector pos_view; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + + llama_batch get_view(int offset, int n_tokens, int n_embd_mmproj); + + // Embedding constructor + kcpp_embd_batch( + float * embd, + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool mrope_is_image = false, + int img_nx = 0, + int img_ny = 0 + ); + + // Token constructor + kcpp_embd_batch( + std::vector & tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image = false, + int img_nx = 0, + int img_ny = 0 + ); + +private: + void init_kcpp_batch( + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny + ); +}; \ No newline at end of file diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index d7c66ab7d..587f453c9 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -14,6 +14,8 @@ #include #include +#include "otherarch/utils.h" + #include "model_adapter.h" #include "tokenizers/vocab/vocab.h" #include "flux.hpp" @@ -54,10 +56,6 @@ using namespace torch_zip; #include "tokenizers/tokenizer.cpp" #include "tokenizers/tokenize_util.cpp" -// FIXME: llama.h errors out if included (through utils.h) -std::vector kcpp_base64_decode(const std::string & encoded_string); -std::string 
kcpp_base64_encode(const unsigned char* data, unsigned int data_length); -std::string get_timestamp_str(); // #include "preprocessing.hpp" #include "stable-diffusion.h" diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 408c4da81..5284ad716 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -1,5 +1,6 @@ #include "model_adapter.h" #include "otherarch/utils.h" +#include "otherarch/llmutils.h" #include "common.h" #include "sampling.h" diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 71d648faa..32f0e4652 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -760,167 +760,6 @@ int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector< return logits_id[idx].second; } -void kcpp_embd_batch::init_kcpp_batch(int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny) { - const int n_pos_per_embd = use_mrope ? 4 : 1; - const llama_seq_id seq_id = 0; - - if (use_mrope && mrope_is_image) { - GGML_ASSERT(img_nx > 0 && img_ny > 0); - GGML_ASSERT(img_nx * img_ny == n_tokens); - } - - pos.resize(n_tokens * n_pos_per_embd); - std::fill(pos.begin(), pos.end(), 0); - - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - - seq_id_0[0] = seq_id; - seq_ids[n_tokens] = nullptr; - - batch.pos = pos.data(); - batch.n_seq_id = n_seq_id.data(); - batch.seq_id = seq_ids.data(); - batch.logits = logits.data(); - - for (int i = 0; i < n_tokens; ++i) { - n_seq_id[i] = 1; - seq_ids[i] = seq_id_0.data(); - logits[i] = return_all_logits; - } - - // ---- position encoding ---- - if (!use_mrope) { - for (int i = 0; i < n_tokens; ++i) { - pos[i] = npast + i; - } - } else if (!mrope_is_image) { - // 1D M-RoPE (audio / embedding stream) - for (int i = 0; i < n_tokens; ++i) { - pos[i + 0 * n_tokens] = npast + i; - pos[i + 1 * n_tokens] = npast + i; - pos[i + 2 * n_tokens] = npast + i; - pos[i 
+ 3 * n_tokens] = 0; - } - } else { - // 2D image M-RoPE - int idx = 0; - for (int y = 0; y < img_ny; ++y) { - for (int x = 0; x < img_nx; ++x) { - pos[idx + 0 * n_tokens] = npast; - pos[idx + 1 * n_tokens] = npast + y; - pos[idx + 2 * n_tokens] = npast + x; - pos[idx + 3 * n_tokens] = 0; - ++idx; - } - } - } - - // Always request logits for last token - logits[n_tokens - 1] = true; -} - -//for embeddings -kcpp_embd_batch::kcpp_embd_batch(float * embd, - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool mrope_is_image, - int img_nx, - int img_ny) { - batch = { - /* n_tokens = */ n_tokens, - /* tokens = */ nullptr, - /* embd = */ embd, - /* pos = */ nullptr, - /* n_seq_id = */ nullptr, - /* seq_id = */ nullptr, - /* logits = */ nullptr, - }; - - init_kcpp_batch(n_tokens, npast, use_mrope, - /*return_all_logits=*/false, mrope_is_image, img_nx, img_ny); -} - -// for tokens -kcpp_embd_batch::kcpp_embd_batch(std::vector & tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny) { - batch = { - /* n_tokens = */ (int32_t) tokens.size(), - /* tokens = */ tokens.data(), - /* embd = */ nullptr, - /* pos = */ nullptr, - /* n_seq_id = */ nullptr, - /* seq_id = */ nullptr, - /* logits = */ nullptr, - }; - - init_kcpp_batch(batch.n_tokens, npast, use_mrope, return_all_logits, mrope_is_image, img_nx, img_ny); -} - -llama_batch kcpp_embd_batch::get_view(int offset, int n_tokens, int n_embd_mmproj) { - GGML_ASSERT(offset >= 0); - GGML_ASSERT(n_tokens > 0); - GGML_ASSERT(offset + n_tokens <= batch.n_tokens); - - const int total_tokens = batch.n_tokens; - llama_pos * pos_ptr = nullptr; - - // Detect M-RoPE vs normal RoPE - const bool is_mrope = (pos.size() > (size_t)total_tokens); - - pos_view.clear(); - - if (is_mrope) { - const int n_pos_per_embd = pos.size() / total_tokens; - GGML_ASSERT(n_pos_per_embd == 4); - - // Layout: - // src: [dim0_all_tokens][dim1_all_tokens][dim2_all_tokens][dim3_all_tokens] - // 
dst: same layout, but only [offset : offset + n_tokens] - pos_view.reserve(n_tokens * n_pos_per_embd); - - for (int dim = 0; dim < n_pos_per_embd; ++dim) { - const llama_pos * src = - pos.data() + dim * total_tokens + offset; - - pos_view.insert( - pos_view.end(), - src, - src + n_tokens - ); - } - - pos_ptr = pos_view.data(); - } - else { - // Normal RoPE: contiguous slice - pos_ptr = pos.data() + offset; - } - - return { - /* n_tokens = */ n_tokens, - /* tokens = */ nullptr, - /* embd = */ batch.embd ? batch.embd + offset*n_embd_mmproj : nullptr, - /* pos = */ pos_ptr, - /* n_seq_id = */ batch.n_seq_id + offset, - /* seq_id = */ batch.seq_id + offset, - /* logits = */ batch.logits + offset, - }; -} std::vector split_string(const std::string& input, const std::string& separator) { std::vector result; diff --git a/otherarch/utils.h b/otherarch/utils.h index 3a5d00544..1c292db3e 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -8,16 +8,6 @@ #include #include #include "ggml_v3.h" -#include "llama.h" - -// -// CLI argument parsing -// - - -// -// Vocab utils -// struct gpt_vocab { using id = int32_t; @@ -73,6 +63,7 @@ std::vector split_string(const std::string& input, const std::strin bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono); bool kcpp_decode_audio_to_f32_stereo_48k(const uint8_t * data, size_t data_size, std::vector & pcm, int & T_audio); +typedef struct ggml_backend_device * ggml_backend_dev_t; std::vector kcpp_parse_device_list(const std::string & value); bool kcpp_string_ends_with(const std::string& str, const std::string& suffix); @@ -81,52 +72,6 @@ int ComputeSharedPrefixLength(const std::vector &tokens_a,const std::vector float ComputePrefixMatchPercent(const std::vector &tokens_a,const std::vector &tokens_b); bool FullyContainedPrefix(std::vector &sequence1, std::vector &sequence2); -//duplcated and modified from llava_embd_batch -struct kcpp_embd_batch { - 
std::vector pos; - std::vector pos_view; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - - llama_batch get_view(int offset, int n_tokens, int n_embd_mmproj); - - // Embedding constructor - kcpp_embd_batch( - float * embd, - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool mrope_is_image = false, - int img_nx = 0, - int img_ny = 0 - ); - - // Token constructor - kcpp_embd_batch( - std::vector & tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image = false, - int img_nx = 0, - int img_ny = 0 - ); - -private: - void init_kcpp_batch( - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny - ); -}; - #pragma pack(push, 1) struct wav16_header { char riff[4] = {'R', 'I', 'F', 'F'};