diff --git a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
deleted file mode 100644
index 08fdbf506..000000000
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR riscv64)
-set(CMAKE_SYSTEM_VERSION 1)
-
-if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
-    message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
-else()
-    set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
-    if (DEFINED ENV{RISCV_ROOT_PATH})
-        file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
-    else()
-        message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
-    endif()
-
-    set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
-    set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
-    set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
-    set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
-    set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
-    set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
-endif()
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
diff --git a/common/arg.cpp b/common/arg.cpp
index cfb65376b..d10368395 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -875,7 +875,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.pop_back();
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }
 
     /**
@@ -2612,6 +2614,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.api_prefix = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
     add_opt(common_arg(
         {"--webui"}, {"--no-webui"},
diff --git a/common/common.h b/common/common.h
index 519ffdd2f..389196606 100644
--- a/common/common.h
+++ b/common/common.h
@@ -480,8 +480,11 @@ struct common_params {
 
     std::map<std::string, std::string> default_template_kwargs;
 
+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bd16ba312..86fe0b5f1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -189,10 +189,10 @@ class ModelBase:
             return tensors
 
         prefix = "model" if not self.is_mistral_format else "consolidated"
-        part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors"))
+        part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
         is_safetensors: bool = len(part_names) > 0
         if not is_safetensors:
-            part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin"))
+            part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
 
         tensor_names_from_index: set[str] = set()
@@ -209,7 +209,8 @@ class ModelBase:
                 if weight_map is None or not isinstance(weight_map, dict):
                     raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
                 tensor_names_from_index.update(weight_map.keys())
-                part_names |= set(weight_map.values())
+                part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
+                part_names = sorted(part_dict.keys())
             else:
                 weight_map = {}
         else:
@@ -1838,7 +1839,7 @@ class MmprojModel(ModelBase):
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, name, n_dims # unused
-        if ".patch_embd.weight" in new_name:
+        if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
             return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return False
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index fc7543ac9..a9d6aee7b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -4186,13 +4186,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
+
 #endif
 
     for (; i < n; ++i) {
@@ -4237,6 +4257,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                          (const __m128i *)(x + i))),
                              16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const
__bf16*)x + i + epr, epr); + vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr); + __riscv_vse32_v_f32m4(y + i + epr, ay1, epr); + } + + // leftovers + int vl; + for (i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl); + vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl); + __riscv_vse32_v_f32m4(y + i, ay0, vl); + } #endif for (; i < n; i++) { y[i] = GGML_BF16_TO_FP32(x[i]); diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index ac8633e21..427e63245 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * sumf += (ggml_float)_mm_cvtss_f32(g); #undef LOAD -#endif +#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma) + size_t vl = __riscv_vsetvlmax_e32m4(); + // initialize accumulators to all zeroes + vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + + // calculate step size + const size_t epr = __riscv_vsetvlmax_e16m2(); + const size_t step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 + for (; i < np; i += step) { + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr); + vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr); + vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr); + vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr); + vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); + } + + // accumulate in 1 register + vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl); + + // leftovers + for (i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl); + vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl); + vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl); + } + + // reduce + vl = __riscv_vsetvlmax_e32m4(); + vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + sumf += __riscv_vfmv_f_s_f32m1_f32(redsum); + +#endif for (; i < n; ++i) { sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * GGML_BF16_TO_FP32(y[i])); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index bd80805fd..3198b33b5 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG } GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); - } - } + + #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh) + size_t vl = __riscv_vsetvlmax_e32m4(); + + // initialize accumulators to all zeroes + vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + + // calculate step size + const size_t epr = __riscv_vsetvlmax_e16m2(); + const 
size_t step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 along the row dimension + for (int i = 0; i < np; i += step) { + vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr); + vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr); + vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr); + vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr); + vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr); + + vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr); + vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr); + vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr); + vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr); + vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr); + } + + vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl); + vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl); + + // leftovers + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl); + vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl); + vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl); + + vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl); + vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl); + } + + // reduce + vl = __riscv_vsetvlmax_e32m2(); + vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), + __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl); + vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0), + __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl); + vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1( + acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + + vl = __riscv_vsetvlmax_e32m2(); + vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0), + __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl); + vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0), + __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl); + vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1( + acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0); + sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1); + #else const int np = (n & ~(GGML_F16_STEP - 1)); @@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, } np = n; #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic - const int np = n; - _Float16 hv = (_Float16)v; - for (int i = 0, avl; i < n; i += avl) { - avl = __riscv_vsetvl_e16m8(n - i); - vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl); - vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl); - vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl); - __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl); + const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); + const _Float16 scale = *(const _Float16*)(&s); + + // calculate step size + const int epr = __riscv_vsetvlmax_e16m4(); + const int step = epr * 2; + int np = (n & ~(step - 1)); + + // unroll by 2 + for (int i = 0; i < np; i += step) { + vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); + ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, 
epr); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr); + vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); + ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); } + + // leftovers + int vl; + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m4(n - i); + vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); + ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); + } + np = n; #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); @@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float svst1_f16(pg, (__fp16 *)(y + np), out); } #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh) - for (int i = 0, vl; i < n; i += vl) { - vl = __riscv_vsetvl_e16m2(n - i); - vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl); - vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl); - vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl); - vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl); - __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl); + const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); + const _Float16 scale = *(const _Float16*)(&s); + + // calculate step size + const int epr = __riscv_vsetvlmax_e16m4(); + const int step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 + for (int i = 0; i < np; i += step) { + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); + ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); + ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); + } + + // leftovers + int vl; + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m4(n - i); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); + ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); } #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 794d90bdd..3268dadfe 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -78,27 +78,25 @@ namespace ggml_cuda_mma { // MIRRORED == Each data value is held exactly once per thread subgroup. DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA. DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3. - DATA_LAYOUT_I_MAJOR_MIRRORED = 20, + DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3. DATA_LAYOUT_J_MAJOR_MIRRORED = 30, - DATA_LAYOUT_I_MAJOR_DUAL = 40, // Matrix A&B for RDNA3. 
}; // Implemented mma combinations are: // - (I_MAJOR, I_MAJOR) -> I_MAJOR // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR - constexpr bool is_i_major(const data_layout dl) { + static constexpr bool is_i_major(const data_layout dl) { return dl == DATA_LAYOUT_I_MAJOR || - dl == DATA_LAYOUT_I_MAJOR_MIRRORED || - dl == DATA_LAYOUT_I_MAJOR_DUAL; + dl == DATA_LAYOUT_I_MAJOR_MIRRORED; } - constexpr data_layout get_input_data_layout() { -#if defined(RDNA3) - return DATA_LAYOUT_I_MAJOR_DUAL; + static constexpr __device__ data_layout get_input_data_layout() { +#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA + return DATA_LAYOUT_I_MAJOR_MIRRORED; #else return DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) +#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA } template @@ -462,11 +460,65 @@ namespace ggml_cuda_mma { } }; + template + struct tile { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; + + // RDNA3 + static constexpr int ne = I * J / 32 * 2; + + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int /*l*/) { + if constexpr (supported()) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (supported()) { + return l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + template struct tile { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; +#if defined(RDNA3) + static constexpr int ne = tile::ne; + + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile::get_i(l); + } + + static __device__ __forceinline__ int get_j(const int l) { + return tile::get_j(l); + } +#else // Volta static constexpr int ne = I * J / (WARP_SIZE/4); half2 x[ne] = {{0.0f, 0.0f}}; @@ -493,6 +545,29 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(RDNA3) + }; + + template + struct tile { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; + static constexpr int ne = tile::ne; + + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile::get_i(l); + } + + static __device__ __forceinline__ int get_j(const int l) { + return tile::get_j(l); + } }; template @@ -528,42 +603,6 @@ namespace ggml_cuda_mma { } }; - template - struct tile { - static constexpr int I = I_; - static constexpr int J = J_; - static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - - static constexpr int ne = I * J / 32 * 2; - - T x[ne] = {0}; - - static constexpr __device__ bool supported() { - if (I == 16 && J == 16) return true; - if (I == 16 && J == 8) return true; - if (I == 16 && J == 4) return true; - return false; - } - - static __device__ __forceinline__ int get_i(const int l) { - if constexpr (supported()) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } - } - - static __device__ __forceinline__ int get_j(const int l) { - if constexpr 
(supported()) { - return l; - } else { - NO_DEVICE_CODE; - return -1; - } - } - }; - #if defined(TURING_MMA_AVAILABLE) template static __device__ __forceinline__ tile get_half2(const tile & tile_float) { diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 7907e706d..4918ae971 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -288,7 +288,7 @@ class LocalTensor: data_range: LocalTensorRange def mmap_bytes(self) -> np.ndarray: - return np.memmap(self.data_range.filename, mode='r', offset=self.data_range.offset, shape=self.data_range.size) + return np.memmap(self.data_range.filename, mode='c', offset=self.data_range.offset, shape=self.data_range.size) class SafetensorsLocal: diff --git a/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja b/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja deleted file mode 100644 index a01e0861c..000000000 --- a/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja +++ /dev/null @@ -1,204 +0,0 @@ -{% macro render_extra_keys(json_dict, handled_keys) %} - {%- if json_dict is mapping %} - {%- for json_key in json_dict if json_key not in handled_keys %} - {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %} - {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '' }} - {%- else %} - {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '' }} - {%- endif %} - {%- endfor %} - {%- endif %} -{% endmacro %} -{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %} -{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %} - -{%- set ns = namespace(last_user_idx = -1) %} -{%- set loop_messages = messages %} -{%- for m in loop_messages %} - {%- if m["role"] == "user" %} - {%- set ns.last_user_idx = loop.index0 %} - {%- endif %} -{%- endfor %} - -{%- if messages[0]["role"] == "system" %} - {%- set system_message = messages[0]["content"] %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set system_message = "" %} - {%- set loop_messages = messages %} -{%- endif %} -{%- if not tools is defined %} - {%- set tools = [] %} -{%- endif %} -{# Recompute last_user_idx relative to loop_messages after handling system #} -{%- set ns = namespace(last_user_idx = -1) %} -{%- for m in loop_messages %} - {%- if m["role"] == "user" %} - {%- set ns.last_user_idx = loop.index0 %} - {%- endif %} -{%- endfor %} -{%- if system_message is defined %} - {{- "<|im_start|>system\n" + system_message }} -{%- else %} - {%- if tools is iterable and tools | length > 0 %} - {{- "<|im_start|>system\n" }} - {%- endif %} -{%- endif %} -{%- if tools is iterable and tools | length > 0 %} - {%- if system_message is defined and system_message | length > 0 %} - {{- "\n\n" }} - {%- endif %} - {{- "# Tools\n\nYou have access to the following functions:\n\n" }} - {{- "" }} - {%- for tool in tools %} - {%- if tool.function is defined %} - {%- set tool = tool.function %} - {%- endif %} - {{- "\n\n" ~ tool.name ~ "" }} - {%- if tool.description is defined %} - {{- '\n' ~ (tool.description | trim) ~ '' }} - {%- endif %} - {{- '\n' }} - {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} - {%- for param_name, param_fields in tool.parameters.properties|items %} - {{- '\n' }} - {{- '\n' ~ param_name ~ '' }} - {%- if param_fields.type is defined %} - {{- '\n' ~ (param_fields.type | 
string) ~ '' }} - {%- endif %} - {%- if param_fields.description is defined %} - {{- '\n' ~ (param_fields.description | trim) ~ '' }} - {%- endif %} - {%- if param_fields.enum is defined %} - {{- '\n' ~ (param_fields.enum | tojson | safe) ~ '' }} - {%- endif %} - {%- set handled_keys = ['name', 'type', 'description', 'enum'] %} - {{- render_extra_keys(param_fields, handled_keys) }} - {{- '\n' }} - {%- endfor %} - {%- endif %} - {% set handled_keys = ['type', 'properties', 'required'] %} - {{- render_extra_keys(tool.parameters, handled_keys) }} - {%- if tool.parameters is defined and tool.parameters.required is defined %} - {{- '\n' ~ (tool.parameters.required | tojson | safe) ~ '' }} - {%- endif %} - {{- '\n' }} - {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} - {{- render_extra_keys(tool, handled_keys) }} - {{- '\n' }} - {%- endfor %} - {{- "\n" }} - - {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} -{%- endif %} - - -{%- if system_message is defined %} - {{- '<|im_end|>\n' }} -{%- else %} - {%- if tools is iterable and tools | length > 0 %} - {{- '<|im_end|>\n' }} - {%- endif %} -{%- endif %} - -{%- for message in loop_messages %} - {%- if message.role == "assistant" %} - {# Add reasoning content in to content field for unified processing below. #} - {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %} - {%- set content = "\n" ~ message.reasoning_content ~ "\n\n" ~ (message.content | default('', true)) %} - {%- else %} - {%- set content = message.content | default('', true) %} - {%- if content is string -%} - {# Allow downstream logic to to take care of broken thought, only handle coherent reasoning here. #} - {%- if '' not in content and '' not in content -%} - {%- set content = "" ~ content -%} - {%- endif -%} - {%- else -%} - {%- set content = content -%} - {%- endif -%} - {%- endif %} - {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} - {# Assistant message has tool calls. #} - {{- '<|im_start|>assistant\n' }} - {%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} - {%- if content is string and content | trim | length > 0 %} - {%- if include_content %} - {{- (content | trim) ~ '\n' -}} - {%- else %} - {%- set c = (content | string) %} - {%- if '' in c %} - {# Keep only content after the last closing think. Also generation prompt causes this. 
#} - {%- set c = c.split('')[-1] %} - {%- elif '' in c %} - {# If was opened but never closed, drop the trailing think segment #} - {%- set c = c.split('')[0] %} - {%- endif %} - {%- set c = "" ~ c | trim %} - {%- if c | length > 0 %} - {{- c ~ '\n' -}} - {%- endif %} - {%- endif %} - {%- else %} - {{- "" -}} - {%- endif %} - {%- for tool_call in message.tool_calls %} - {%- if tool_call.function is defined %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n\n' -}} - {%- if tool_call.arguments is defined %} - {%- for args_name, args_value in tool_call.arguments|items %} - {{- '\n' -}} - {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} - {{- args_value ~ '\n\n' -}} - {%- endfor %} - {%- endif %} - {{- '\n\n' -}} - {%- endfor %} - {{- '<|im_end|>\n' }} - {%- else %} - {# Assistant message doesn't have tool calls. #} - {%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} - {{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }} - {%- else %} - {%- set c = (content | default('', true) | string) %} - {%- if '' in c and '' in c %} - {%- set c = "" ~ c.split('')[-1] %} - {%- endif %} - {%- set c = c | trim %} - {%- if c | length > 0 %} - {{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }} - {%- else %} - {{- '<|im_start|>assistant\n<|im_end|>\n' }} - {%- endif %} - {%- endif %} - {%- endif %} - {%- elif message.role == "user" or message.role == "system" %} - {{- '<|im_start|>' + message.role + '\n' }} - {%- set content = message.content | string %} - {{- content }} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.previtem and loop.previtem.role != "tool" %} - {{- '<|im_start|>user\n' }} - {%- endif %} - {{- '\n' }} - {{- message.content }} - {{- '\n\n' }} - {%- if not loop.last and loop.nextitem.role != "tool" %} - {{- '<|im_end|>\n' }} - {%- elif loop.last %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- else %} - {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} - {%- endif %} -{%- endfor %} - -{%- if add_generation_prompt %} - {%- if enable_thinking %} - {{- '<|im_start|>assistant\n\n' }} - {%- else %} - {{- '<|im_start|>assistant\n' }} - {%- endif %} -{%- endif %} diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py deleted file mode 100644 index 63861dd9a..000000000 --- a/scripts/compare-logprobs.py +++ /dev/null @@ -1,281 +0,0 @@ -import argparse -import requests -import json -from pathlib import Path -import logging - -logger = logging.getLogger("compare-logprobs") -logging.basicConfig(level=logging.INFO) - - -DESCRIPTION = """ -Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints. - -Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally. 
- -Example usage: - Step 1: Dump logits from two different servers - python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions - python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions - - (optionally, you can add --api-key if the endpoint requires authentication) - - Step 2: Compare the dumped logits - python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md -""" - - -def generate_input_prompt(length: int) -> list[str]: - CORPUS = """ - You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls. - - ### Tool Call Format: - When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text. - - You can make multiple calls in one go by placing them one after another. - """ - words = [w.strip() for w in CORPUS.strip().split(" ")] - words = [w for w in words if len(w) > 0] # filter out empty strings - while len(words) < length: - words += words - return words[:length] - - -def dump_logits( - endpoint: str, - output_path: Path, - input_words: list[str], - pattern: list[tuple[bool, int]], - api_key=None, -): - logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...") - words = input_words - curr_text = "" - n_total = sum(n for get, n in pattern if get) - n_done = 0 - i_cur = 0 - i_total = len(words) - with output_path.open("w") as f: - for get, n in pattern: - if not get: - # skip n words - for i in range(n): - curr_text += words.pop(0) + " " - i_cur += 1 - continue - # get n words - for i in range(n): - curr_text += words.pop(0) + " " - payload = { - "prompt": curr_text.strip(), - "temperature": 0.0, - "top_k": 1, - "max_tokens": 1, - "logprobs": 1, - "stream": False, - } - response = requests.post( - endpoint, - json=payload, - headers={"Authorization": f"Bearer {api_key}"} if api_key else {}, - ) - response.raise_for_status() - data = response.json() - data["__index"] = i_cur # add index for easier debugging later - data = json.dumps(data) - f.write(f"{data}\n") - n_done += 1 - i_cur += 1 - logger.info( - f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]" - ) - logger.info(f"Logits dumped to {output_path}") - - -def get_token_logprobs(data: dict): - logprobs = data["choices"][0]["logprobs"] - if "content" in logprobs: - # llama.cpp case - top = logprobs["content"][0]["top_logprobs"][0] - return top["token"], top["logprob"] - else: - # vllm case - tokens = logprobs["tokens"] - token_logprobs = logprobs["token_logprobs"] - return tokens[0], token_logprobs[0] - - -def clean_text(text: str) -> str: - return ( - "'" - + text.replace("\n", "\\n") - .replace("\t", "\\t") - .replace("\r", "\\r") - .replace("|", "\\|") - + "'" - ) - - -def compare_logits(input1: Path, input2: Path, output_path: Path): - with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout: - lines1 = f1.readlines() - lines2 = f2.readlines() - - tab_header = [ - "idx", - input1.name, - "logprob_1", - input2.name, - "logprob_2", - "diff (abs)", - ] - tab_entries = [] - tab_max_widths = [len(h) for h in tab_header] - - assert len(lines1) == len( - lines2 - ), "Input files must have the same number of lines." 
- - fout.write("# Logits Comparison Report\n\n") - for i, (line1, line2) in enumerate(zip(lines1, lines2)): - if not line1.strip() or not line2.strip(): - continue # skip empty lines - - data1 = json.loads(line1) - data2 = json.loads(line2) - - idx1 = data1.get("__index", -1) - idx2 = data2.get("__index", -1) - if idx1 != idx2: - logger.warning( - f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}" - ) - - token1, logprob1 = get_token_logprobs(data1) - token2, logprob2 = get_token_logprobs(data2) - - token1 = clean_text(token1) - token2 = clean_text(token2) - abs_diff = abs(logprob1 - logprob2) - - tab_entries.append( - ( - str(idx1 + 1), - token1, - f"{logprob1:.4f}", - token2, - f"{logprob2:.4f}", - f"{(abs_diff):.4f}", - ) - ) - - for i in range(len(tab_entries)): - for j in range(len(tab_header)): - tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j])) - - output = "" - for j in range(len(tab_header)): - output += f"| {tab_header[j]:<{tab_max_widths[j]}} " - output += "|\n" - for j in range(len(tab_header)): - output += f"|{'-' * (tab_max_widths[j] + 2)}" - output += "|\n" - for entry in tab_entries: - for j in range(len(tab_header)): - output += f"| {entry[j]:<{tab_max_widths[j]}} " - output += "|\n" - - logger.info("\n" + output) - fout.write(output) - logger.info(f"Report written to {output_path}") - - -def parse_pattern(pattern: str) -> list[tuple[bool, int]]: - parts = pattern.split(",") - result = [] - for i, part in enumerate(parts): - n = int(part) - if i % 2 == 0: - result.append((True, n)) # get n words - else: - result.append((False, n)) # skip n words - return result - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter - ) - subparsers = parser.add_subparsers( - dest="verb", required=True, help="action to perform" - ) - - # dump subcommand - parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint") - parser_dump.add_argument( - "output", type=Path, help="output path for dumped logits (.log)" - ) - parser_dump.add_argument( - "endpoint", type=str, help="OAI-compat /completions endpoint" - ) - parser_dump.add_argument( - "--api-key", - type=str, - default=None, - help="API key for authentication (if required)", - ) - parser_dump.add_argument( - "--file", - type=Path, - default=None, - help="File containing prompt to use instead of the default", - ) - parser_dump.add_argument( - "--pattern", - type=str, - default="10,1000,10,4000,10", - help="Pattern n_get,n_skip,... 
where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)", - ) - - # compare subcommand - parser_compare = subparsers.add_parser( - "compare", help="compare two dumped logits files" - ) - parser_compare.add_argument("input1", type=Path, help="first input file (.log)") - parser_compare.add_argument("input2", type=Path, help="second input file (.log)") - parser_compare.add_argument( - "output", type=Path, help="output path for comparison report (.md)" - ) - - try: - return parser.parse_args() - except Exception as e: - parser.print_help() - raise e - - -def main(): - args = parse_args() - - if args.verb == "dump": - pattern = parse_pattern(args.pattern) - input_length = sum(n for _, n in pattern) - input_words = generate_input_prompt(input_length) - if args.file is not None: - with args.file.open("r") as f: - input_words = f.read().strip().split(" ") - if input_length < sum(n for _, n in pattern): - raise ValueError( - f"Input file has only {input_length} words, but pattern requires at least {input_length} words." - ) - input_length = len(input_words) - logger.info(f"Using {input_length} words") - dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key) - elif args.verb == "compare": - compare_logits(args.input1, args.input2, args.output) - else: - raise ValueError(f"Unknown verb: {args.verb}") - - -if __name__ == "__main__": - main() diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh deleted file mode 100755 index 91d868278..000000000 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/sh -# - -# Basedir on device -basedir=/data/local/tmp/llama.cpp - -cli_opts= - -branch=. -[ "$B" != "" ] && branch=$B - -adbserial= -[ "$S" != "" ] && adbserial="-s $S" - -model="gemma-3-4b-it-Q4_0.gguf" -[ "$M" != "" ] && model="$M" - -mmproj="mmproj-F16.gguf" -[ "$MMPROJ" != "" ] && mmproj="$MMPROJ" - -image= -[ "$IMG" != "" ] && image="$IMG" - -device="HTP0" -[ "$D" != "" ] && device="$D" - -verbose= -[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" - -experimental="GGML_HEXAGON_EXPERIMENTAL=1" -[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" - -sched= -[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" - -profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" - -opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" - -nhvx= -[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" - -ndev= -[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" - -# MTMD backend device for vision model (defaults to CPU if not set) -mtmd_backend= -[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE" - -set -x - -adb $adbserial shell " \ - cd $basedir; ulimit -c unlimited; \ - LD_LIBRARY_PATH=$basedir/$branch/lib \ - ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ - ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ - --mmproj $basedir/../gguf/$mmproj \ - --image $basedir/../gguf/$image \ - --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ - -ngl 99 --device $device -v $cli_opts $@ \ -" diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 339917b24..d7e019bde 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -13,9 +13,10 @@ #ifdef __has_include #if __has_include() #include + #include + #include #if defined(_POSIX_MAPPED_FILES) 
#include - #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -74,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -153,13 +154,40 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + throw std::runtime_error("DirectIO is not implemented on Windows."); + } + ~impl() { if (fp) { std::fclose(fp); } } #else - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { +#ifdef __linux__ + // Try unbuffered I/O for read only + if (use_direct_io && std::strcmp(mode, "rb") == 0) { + fd = open(fname, O_RDONLY | O_DIRECT); + + if (fd != -1) { + struct stat file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + alignment = file_stats.st_blksize; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return; + } + + LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O", + fname, strerror(errno)); + } +#endif fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -170,27 +198,30 @@ struct llama_file::impl { } size_t tell() const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - if (ret == -1) { - throw std::runtime_error(format("ftell error: %s", strerror(errno))); + if (fd == -1) { + long ret = std::ftell(fp); + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + + return (size_t) ret; } - return (size_t) ret; + off_t pos = lseek(fd, 0, SEEK_CUR); + if (pos == -1) { + throw std::runtime_error(format("lseek error: %s", strerror(errno))); + } + return (size_t) pos; } void seek(size_t offset, int whence) const { -// TODO: this ifdef is never true? 
-#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - if (ret != 0) { + off_t ret = 0; + if (fd == -1) { + ret = std::fseek(fp, (long) offset, whence); + } else { + ret = lseek(fd, offset, whence); + } + if (ret == -1) { throw std::runtime_error(format("seek error: %s", strerror(errno))); } } @@ -200,13 +231,55 @@ struct llama_file::impl { return; } errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + if (fd == -1) { + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } else { + bool successful = false; + while (!successful) { + off_t ret = read(fd, ptr, len); + + if (ret == -1) { + if (errno == EINTR) { + continue; // Interrupted by signal, retry + } + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + successful = true; + } } - if (ret != 1) { - throw std::runtime_error("unexpectedly reached end of file"); + } + + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); + + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } + + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); + + seek(aligned_offset, SEEK_SET); + read_raw(buffer.get(), bytes_to_read); + + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); } uint32_t read_u32() const { @@ -231,22 +304,43 @@ struct llama_file::impl { } ~impl() { - if (fp) { + if (fd != -1) { + close(fd); + } else { std::fclose(fp); } } + int fd = -1; #endif - FILE * fp; - size_t size; + void read_raw_at(void * ptr, size_t len, size_t offset) const { + if (alignment != 1) { + read_aligned_chunk(offset, ptr, len); + } else { + seek(offset, SEEK_SET); + read_raw(ptr, len); + } + } + + size_t read_alignment() const { + return alignment; + } + + size_t alignment = 1; + + FILE * fp{}; + size_t size{}; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : + pimpl(std::make_unique(fname, mode, use_direct_io)) {} llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } + int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); @@ -261,6 +355,7 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); 
} uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f4..729aac164 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,7 @@ #include #include #include +#include struct llama_file; struct llama_mmap; @@ -13,7 +14,7 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); + llama_file(const char * fname, const char * mode, bool use_direct_io = false); ~llama_file(); size_t tell() const; @@ -24,11 +25,14 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; + void read_raw_at(void * ptr, size_t len, size_t offset) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; + size_t read_alignment() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 2a0e16bba..966e82d00 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -508,7 +508,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); + files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -576,7 +576,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb")); + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -958,7 +958,15 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + + size_t alignment = 1; + for (const auto & file : files) { + alignment = std::max(file->read_alignment(), alignment); + } + + // Buffer size: balance between memory usage and I/O efficiency + // 64MB works well for NVMe drives + const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -1008,6 +1016,7 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. 
for (size_t idx = 0; idx < n_buffers; ++idx) { auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1089,9 +1098,9 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); + if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); + file->read_raw_at(cur->data, n_size, weight->offs); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1100,26 +1109,60 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { - file->seek(weight->offs, SEEK_SET); + auto offset = (off_t) weight->offs; + alignment = file->read_alignment(); + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); + + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + + // Wait for previous upload to complete before reusing buffer ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; + + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } + + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } + + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); ggml_backend_event_record(events[buffer_idx], upload_backend); - bytes_read += read_iteration; + data_read += data_to_copy; + bytes_read += read_size; + ++buffer_idx; buffer_idx %= n_buffers; } } else { read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); + file->read_raw_at(read_buf.data(), n_size, weight->offs); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8cd222b61..abd76900d 100644 --- 
a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2480,7 +2480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); + int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); #if defined(GGML_USE_CLBLAST) printf("\nOpenCL GPU Offload Fallback...\n"); @@ -2491,9 +2491,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (cpu_dev == nullptr) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); } - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); + const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -6852,10 +6852,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (llama_supports_gpu_offload()) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { + int n_repeating = n_gpu; + if (n_repeating > 0) { LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); + n_repeating--; } + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; diff --git a/src/llama.cpp b/src/llama.cpp index 4502fd42f..9be51e42c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -316,10 +316,6 @@ static void llama_params_fit_impl( if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); } - if (hp_ngl < 2*nd) { - throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least " - + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort"); - } } if (!tensor_buft_overrides) { throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort"); @@ -386,8 +382,7 @@ static void llama_params_fit_impl( auto set_ngl_tensor_split_tbo = [&]( const std::vector & ngl_per_device, const std::vector & overflow_bufts, - llama_model_params & mparams, - const bool add_nonrepeating) { + llama_model_params & mparams) { mparams.n_gpu_layers = 0; for (size_t id = 0; id < nd; id++) { mparams.n_gpu_layers += ngl_per_device[id].n_layer; @@ -395,13 +390,9 @@ static void llama_params_fit_impl( tensor_split[id] = ngl_per_device[id].n_layer; } } - assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl); - uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides + assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); + uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides - if (add_nonrepeating) { - mparams.n_gpu_layers += 1; - tensor_split[nd - 1] += 1; - } mparams.tensor_split = tensor_split; size_t itbo = 0; @@ -432,10 +423,9 @@ static void llama_params_fit_impl( 
auto get_memory_for_layers = [&]( const char * func_name, const std::vector & ngl_per_device, - const std::vector & overflow_bufts, - const bool add_nonrepeating) -> std::vector { + const std::vector & overflow_bufts) -> std::vector { llama_model_params mparams_copy = *mparams; - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); const dmds_t dmd_nl = llama_get_device_memory_data( path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); @@ -493,9 +483,6 @@ static void llama_params_fit_impl( LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); } - // whether for the optimal memory use we expect to load at least some MoE tensors: - const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0; - std::vector overflow_bufts; // which bufts the partial layers of a device overflow to: overflow_bufts.reserve(nd); for (size_t id = 0; id < nd - 1; ++id) { @@ -504,7 +491,7 @@ static void llama_params_fit_impl( overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); std::vector ngl_per_device(nd); - std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe); + std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); if (hp_nex > 0) { for (size_t id = 0; id < nd; id++) { ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE; @@ -517,13 +504,14 @@ static void llama_params_fit_impl( // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target // - check memory use of our guess, replace either the low or high bound // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits + // - the last device has the output layer, which cannot be a partial layer if (hp_nex == 0) { LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); } else { LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); } for (int id = nd - 1; id >= 0; id--) { - uint32_t n_unassigned = hp_ngl; + uint32_t n_unassigned = hp_ngl + 1; for (size_t jd = id + 1; jd < nd; ++jd) { assert(n_unassigned >= ngl_per_device[jd].n_layer); n_unassigned -= ngl_per_device[jd].n_layer; @@ -532,10 +520,10 @@ static void llama_params_fit_impl( std::vector ngl_per_device_high = ngl_per_device; ngl_per_device_high[id].n_layer = n_unassigned; if (hp_nex > 0) { - ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer; + ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? 
ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; } if (ngl_per_device_high[id].n_layer > 0) { - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; @@ -550,7 +538,7 @@ static void llama_params_fit_impl( if (hp_nex) { ngl_per_device_test[id].n_part += step_size; } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -577,7 +565,7 @@ static void llama_params_fit_impl( __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); } if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); return; } @@ -600,13 +588,13 @@ static void llama_params_fit_impl( for (size_t id = 0; id <= id_dense_start; id++) { std::vector ngl_per_device_high = ngl_per_device; for (size_t jd = id_dense_start; jd < nd; jd++) { - const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer; + const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; ngl_per_device_high[id].n_layer += n_layer_move; ngl_per_device_high[jd].n_layer -= n_layer_move; ngl_per_device_high[jd].n_part = 0; } size_t id_dense_start_high = nd - 1; - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part); @@ -634,7 +622,7 @@ static void llama_params_fit_impl( break; } } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -661,7 +649,7 @@ static void llama_params_fit_impl( } // try to fit at least part of one more layer - if (ngl_per_device[id_dense_start].n_layer > 0) { + if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 
0 : 1)) { std::vector ngl_per_device_test = ngl_per_device; size_t id_dense_start_test = id_dense_start; ngl_per_device_test[id_dense_start_test].n_layer--; @@ -673,7 +661,7 @@ static void llama_params_fit_impl( } ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); - std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -683,7 +671,7 @@ static void llama_params_fit_impl( ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -694,7 +682,7 @@ static void llama_params_fit_impl( } else { ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -711,7 +699,7 @@ static void llama_params_fit_impl( __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); } - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); } bool llama_params_fit( diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp deleted file mode 100644 index 481b39d04..000000000 --- a/tests/test-state-restore-fragmented.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Test for state restore with fragmented KV cache -// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527 -// The issue was that state restore required contiguous KV cache slots, -// which fails when the cache is fragmented. -// -// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false) -// in state_read_meta(), allowing non-contiguous slot allocation. - -#include "arg.h" -#include "common.h" -#include "llama.h" - -#include -#include -#include - -int main(int argc, char ** argv) { - common_params params; - - params.sampling.seed = 1234; - params.kv_unified = true; - params.n_parallel = 3; - params.n_ctx = 256; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { - return 1; - } - - common_init(); - - // init - common_init_result_ptr llama_init = common_init_from_params(params); - - llama_model * model = llama_init->model(); - llama_context * ctx = llama_init->context(); - - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - GGML_UNUSED(model); - - // tokenize prompt - std::vector tokens(70, 1); - - // interleave the 3 sequences: - // 01201230123... 
- llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1); - for (size_t i = 0; i < tokens.size(); i++) { - for (int s = 0; s < params.n_parallel; ++s) { - common_batch_add(batch, tokens[i], i, {s}, false); - } - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 0\n", __func__); - return 1; - } - - fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size()); - - // Save state of seq 1 - std::vector seq_state(llama_state_seq_get_size(ctx, 1)); - const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1); - if (ncopy != seq_state.size()) { - fprintf(stderr, "%s : failed to save seq 1 state\n", __func__); - return 1; - } - fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy); - - // clear seq 1 to create a "hole" in the KV cache (fragmentation) - // 0.20.20.20.2.... - llama_memory_t mem = llama_get_memory(ctx); - llama_memory_seq_rm(mem, 1, -1, -1); - fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__); - - // Now the cache has holes where seq 1 was - // This creates fragmentation - there's no contiguous block large enough - // for the seq 1 state if we only look for contiguous slots - - // Restore seq 1 state into seq 1 (should work with non-contiguous allocation) - // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1) - // Before the fix, this would fail with "failed to find available cells in kv cache" - const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1); - if (nset != seq_state.size()) { - fprintf(stderr, "%s : FAILED to restore seq state into fragmented cache (got %zu, expected %zu)\n", - __func__, nset, seq_state.size()); - fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__); - llama_batch_free(batch); - return 1; - } - fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); - - // Verify we can decode with the restored state - // Generate one token to verify the restored state is usable - auto sparams = llama_sampler_chain_default_params(); - llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); - - common_batch_clear(batch); - common_batch_add(batch, next_token, (int)tokens.size(), {1}, true); - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode with restored state\n", __func__); - llama_sampler_free(smpl); - llama_batch_free(batch); - return 1; - } - - fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str()); - fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__); - - llama_sampler_free(smpl); - llama_batch_free(batch); - - return 0; -} diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt deleted file mode 100644 index b08fff4c2..000000000 --- a/tools/cli/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -include_directories(../server) - -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) -endif() diff --git 
a/tools/cli/README.md b/tools/cli/README.md deleted file mode 100644 index 1333ed77b..000000000 --- a/tools/cli/README.md +++ /dev/null @@ -1 +0,0 @@ -TODO diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp deleted file mode 100644 index 8a8639207..000000000 --- a/tools/cli/cli.cpp +++ /dev/null @@ -1,395 +0,0 @@ -#include "common.h" -#include "arg.h" -#include "console.h" -// #include "log.h" - -#include "server-context.h" -#include "server-task.h" - -#include -#include -#include -#include - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#endif - -const char * LLAMA_ASCII_LOGO = R"( -▄▄ ▄▄ -██ ██ -██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ -██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ -██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ - ██ ██ - ▀▀ ▀▀ -)"; - -static std::atomic g_is_interrupted = false; -static bool should_stop() { - return g_is_interrupted.load(); -} - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -static void signal_handler(int) { - if (g_is_interrupted.load()) { - // second Ctrl+C - exit immediately - // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) - fprintf(stdout, "\033[0m\n"); - fflush(stdout); - std::exit(130); - } - g_is_interrupted.store(true); -} -#endif - -struct cli_context { - server_context ctx_server; - json messages = json::array(); - std::vector input_files; - task_params defaults; - - // thread for showing "loading" animation - std::atomic loading_show; - - cli_context(const common_params & params) { - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; - - defaults.stream = true; // make sure we always use streaming mode - defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way - // defaults.return_progress = true; // TODO: show progress - defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - } - - std::string generate_completion(result_timings & out_timings) { - server_response_reader rd = ctx_server.get_response_reader(); - { - // TODO: reduce some copies here in the future - server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_input = messages; // copy - task.cli_files = input_files; // copy - rd.post_task({std::move(task)}); - } - - // wait for first result - console::spinner::start(); - server_task_result_ptr result = rd.next(should_stop); - - console::spinner::stop(); - std::string curr_content; - bool is_thinking = false; - - while (result) { - if (should_stop()) { - break; - } - if (result->is_error()) { - json err_data = result->to_json(); - if (err_data.contains("message")) { - console::error("Error: %s\n", err_data["message"].get().c_str()); - } else { - console::error("Error: %s\n", err_data.dump().c_str()); - } - return curr_content; - } - auto res_partial = dynamic_cast(result.get()); - if (res_partial) { - out_timings = std::move(res_partial->timings); - for (const auto & diff : res_partial->oaicompat_msg_diffs) { - if (!diff.content_delta.empty()) { - if (is_thinking) { - console::log("\n[End thinking]\n\n"); - console::set_display(DISPLAY_TYPE_RESET); - is_thinking = false; - } - curr_content += diff.content_delta; - console::log("%s", diff.content_delta.c_str()); - 
console::flush(); - } - if (!diff.reasoning_content_delta.empty()) { - console::set_display(DISPLAY_TYPE_REASONING); - if (!is_thinking) { - console::log("[Start thinking]\n"); - } - is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); - console::flush(); - } - } - } - auto res_final = dynamic_cast(result.get()); - if (res_final) { - out_timings = std::move(res_final->timings); - break; - } - result = rd.next(should_stop); - } - g_is_interrupted.store(false); - // server_response_reader automatically cancels pending tasks upon destruction - return curr_content; - } - - // TODO: support remote files in the future (http, https, etc) - std::string load_input_file(const std::string & fname, bool is_media) { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return ""; - } - if (is_media) { - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - input_files.push_back(std::move(buf)); - return mtmd_default_marker(); - } else { - std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - return content; - } - } -}; - -int main(int argc, char ** argv) { - common_params params; - - params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { - return 1; - } - - // TODO: maybe support it later? - if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { - console::error("--no-conversation is not supported by llama-cli\n"); - console::error("please use llama-completion instead\n"); - } - - common_init(); - - // struct that contains llama context and inference - cli_context ctx_cli(params); - - llama_backend_init(); - llama_numa_init(params.numa); - - // TODO: avoid using atexit() here by making `console` a singleton - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - console::set_display(DISPLAY_TYPE_RESET); - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = signal_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); - sigaction(SIGTERM, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif - - console::log("\nLoading model... 
"); // followed by loading animation - console::spinner::start(); - if (!ctx_cli.ctx_server.load_model(params)) { - console::spinner::stop(); - console::error("\nFailed to load the model\n"); - return 1; - } - - ctx_cli.ctx_server.init(); - - console::spinner::stop(); - console::log("\n"); - - std::thread inference_thread([&ctx_cli]() { - ctx_cli.ctx_server.start_loop(); - }); - - auto inf = ctx_cli.ctx_server.get_info(); - std::string modalities = "text"; - if (inf.has_inp_image) { - modalities += ", vision"; - } - if (inf.has_inp_audio) { - modalities += ", audio"; - } - - if (!params.system_prompt.empty()) { - ctx_cli.messages.push_back({ - {"role", "system"}, - {"content", params.system_prompt} - }); - } - - console::log("\n"); - console::log("%s\n", LLAMA_ASCII_LOGO); - console::log("build : %s\n", inf.build_info.c_str()); - console::log("model : %s\n", inf.model_name.c_str()); - console::log("modalities : %s\n", modalities.c_str()); - if (!params.system_prompt.empty()) { - console::log("using custom system prompt\n"); - } - console::log("\n"); - console::log("available commands:\n"); - console::log(" /exit or Ctrl+C stop or exit\n"); - console::log(" /regen regenerate the last response\n"); - console::log(" /clear clear the chat history\n"); - console::log(" /read add a text file\n"); - if (inf.has_inp_image) { - console::log(" /image add an image file\n"); - } - if (inf.has_inp_audio) { - console::log(" /audio add an audio file\n"); - } - console::log("\n"); - - // interactive loop - std::string cur_msg; - while (true) { - std::string buffer; - console::set_display(DISPLAY_TYPE_USER_INPUT); - if (params.prompt.empty()) { - console::log("\n> "); - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - } else { - // process input prompt from args - for (auto & fname : params.image) { - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - break; - } - console::log("Loaded media from '%s'\n", fname.c_str()); - cur_msg += marker; - } - buffer = params.prompt; - if (buffer.size() > 500) { - console::log("\n> %s ... 
(truncated)\n", buffer.substr(0, 500).c_str()); - } else { - console::log("\n> %s\n", buffer.c_str()); - } - params.prompt.clear(); // only use it once - } - console::set_display(DISPLAY_TYPE_RESET); - console::log("\n"); - - if (should_stop()) { - g_is_interrupted.store(false); - break; - } - - // remove trailing newline - if (!buffer.empty() &&buffer.back() == '\n') { - buffer.pop_back(); - } - - // skip empty messages - if (buffer.empty()) { - continue; - } - - bool add_user_msg = true; - - // process commands - if (string_starts_with(buffer, "/exit")) { - break; - } else if (string_starts_with(buffer, "/regen")) { - if (ctx_cli.messages.size() >= 2) { - size_t last_idx = ctx_cli.messages.size() - 1; - ctx_cli.messages.erase(last_idx); - add_user_msg = false; - } else { - console::error("No message to regenerate.\n"); - continue; - } - } else if (string_starts_with(buffer, "/clear")) { - ctx_cli.messages.clear(); - ctx_cli.input_files.clear(); - console::log("Chat history cleared.\n"); - continue; - } else if ( - (string_starts_with(buffer, "/image ") && inf.has_inp_image) || - (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { - // just in case (bad copy-paste for example), we strip all trailing/leading spaces - std::string fname = string_strip(buffer.substr(7)); - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - continue; - } - cur_msg += marker; - console::log("Loaded media from '%s'\n", fname.c_str()); - continue; - } else if (string_starts_with(buffer, "/read ")) { - std::string fname = string_strip(buffer.substr(6)); - std::string marker = ctx_cli.load_input_file(fname, false); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - continue; - } - cur_msg += marker; - console::log("Loaded text from '%s'\n", fname.c_str()); - continue; - } else { - // not a command - cur_msg += buffer; - } - - // generate response - if (add_user_msg) { - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", cur_msg} - }); - cur_msg.clear(); - } - result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); - ctx_cli.messages.push_back({ - {"role", "assistant"}, - {"content", assistant_content} - }); - console::log("\n"); - - if (params.show_timings) { - console::set_display(DISPLAY_TYPE_INFO); - console::log("\n"); - console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); - console::set_display(DISPLAY_TYPE_RESET); - } - - if (params.single_turn) { - break; - } - } - - console::set_display(DISPLAY_TYPE_RESET); - - console::log("\nExiting...\n"); - ctx_cli.ctx_server.terminate(); - inference_thread.join(); - - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); - - return 0; -} diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index d6cc23ebf..9e44f0326 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 90898b5ec..def57d025 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -544,6 +544,8 @@ struct server_context_impl { server_metrics metrics; + json webui_settings = json::object(); + // Necessary 
similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -575,6 +577,16 @@ struct server_context_impl { params_base = params; + webui_settings = json::object(); + if (!params_base.webui_config_json.empty()) { + try { + webui_settings = json::parse(params_base.webui_config_json); + } catch (const std::exception & e) { + SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + return false; + } + } + llama_init = common_init_from_params(params_base); model = llama_init->model(); @@ -3103,7 +3115,6 @@ void server_routes::init_routes() { }; } - // this endpoint is publicly available, please only return what is safe to be exposed json data = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, @@ -3117,6 +3128,7 @@ void server_routes::init_routes() { { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, + { "webui_settings", ctx_server.webui_settings }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 8b5cb5d27..c1f86e549 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -168,7 +168,9 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || env == "LLAMA_ARG_HF_REPO" || - env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") { + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || + env == "LLAMA_ARG_SSL_KEY_FILE" || + env == "LLAMA_ARG_SSL_CERT_FILE") { control_args[env] = opt; } } @@ -222,6 +224,9 @@ void server_presets::render_args(server_model_meta & meta) { preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } } + // disable SSL for child processes (HTTPS already handled by router) + preset.options[control_args["LLAMA_ARG_SSL_KEY_FILE"]] = ""; + preset.options[control_args["LLAMA_ARG_SSL_CERT_FILE"]] = ""; meta.args = preset.to_args(); // add back the binary path at the front meta.args.insert(meta.args.begin(), get_server_exec_path().string()); @@ -813,6 +818,7 @@ void server_models_routes::init_routes() { {"params", json{}}, {"n_ctx", 0}, }}, + {"webui_settings", webui_settings}, }); return res; } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 227b15bbc..cbc4c4324 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -2,6 +2,7 @@ #include "common.h" #include "preset.h" +#include "server-common.h" #include "server-http.h" #include @@ -149,9 +150,18 @@ public: struct server_models_routes { common_params params; + json webui_settings = json::object(); server_models models; server_models_routes(const common_params & params, int argc, char ** argv, char ** envp) : params(params), models(params, argc, argv, envp) { + if (!this->params.webui_config_json.empty()) { + try { + webui_settings = json::parse(this->params.webui_config_json); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + throw; + } + } init_routes(); } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3cebe174b..b6b611b3f 100644 --- a/tools/server/server.cpp +++ 
b/tools/server/server.cpp @@ -8,6 +8,7 @@ #include "log.h" #include +#include #include #include // for std::thread::hardware_concurrency @@ -124,7 +125,12 @@ int main(int argc, char ** argv, char ** envp) { std::optional models_routes{}; if (is_router_server) { // setup server instances manager - models_routes.emplace(params, argc, argv, envp); + try { + models_routes.emplace(params, argc, argv, envp); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what()); + return 1; + } // proxy handlers // note: routes.get_health stays the same diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index 4f37b308b..0d1a03aca 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -2109,9 +2109,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.48.5", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.48.5.tgz", - "integrity": "sha512-/rnwfSWS3qwUSzvHynUTORF9xSJi7PCR9yXkxUOnRrNqyKmCmh3FPHH+E9BbgqxXfTevGXBqgnlh9kMb+9T5XA==", + "version": "2.49.2", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.49.2.tgz", + "integrity": "sha512-Vp3zX/qlwerQmHMP6x0Ry1oY7eKKRcOWGc2P59srOp4zcqyn+etJyQpELgOi4+ZSUgteX8Y387NuwruLgGXLUQ==", "dev": true, "license": "MIT", "dependencies": { @@ -5797,9 +5797,9 @@ } }, "node_modules/mdast-util-to-hast": { - "version": "13.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", - "integrity": "sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==", + "version": "13.2.1", + "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.1.tgz", + "integrity": "sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA==", "license": "MIT", "dependencies": { "@types/hast": "^3.0.0", diff --git a/tools/server/webui/src/app.d.ts b/tools/server/webui/src/app.d.ts index 71976936e..73287d91b 100644 --- a/tools/server/webui/src/app.d.ts +++ b/tools/server/webui/src/app.d.ts @@ -124,3 +124,10 @@ declare global { SettingsConfigType }; } + +declare global { + interface Window { + idxThemeStyle?: number; + idxCodeBlock?: number; + } +} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte index 2c9a012ef..8997963f1 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte @@ -244,7 +244,7 @@
{#if displayedModel()} - +
{#if isRouter} {/if} - +
{/if} {#if config().showToolCalls} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte index a453a3101..a39acb1d7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte @@ -1,20 +1,122 @@ - +
+
+ {#if hasPromptStats} + + + + + +

Reading (prompt processing)

+
+
+ {/if} + + + + + +

Generation (token output)

+
+
+
- - - +
+ {#if activeView === ChatMessageStatsView.GENERATION} + + + + {:else if hasPromptStats} + + + + {/if} +
+
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index 8eacb7744..23a9f37b7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -587,7 +587,7 @@ &::after { content: ''; - position: fixed; + position: absolute; bottom: 0; z-index: -1; left: 0; diff --git a/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte b/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte index 9e5339cab..a2b28d205 100644 --- a/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +++ b/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte @@ -1,5 +1,6 @@ - - {#snippet icon()} - - {/snippet} +{#if tooltipLabel} + + + + {#snippet icon()} + + {/snippet} - {value} - + {value} + + + +

{tooltipLabel}

+
+ +{:else} + + {#snippet icon()} + + {/snippet} + + {value} + +{/if} diff --git a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte index 2a4a39535..cb3ae17a6 100644 --- a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +++ b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte @@ -7,15 +7,19 @@ import remarkRehype from 'remark-rehype'; import rehypeKatex from 'rehype-katex'; import rehypeStringify from 'rehype-stringify'; - import { copyCodeToClipboard, preprocessLaTeX } from '$lib/utils'; - import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer'; + import type { Root as HastRoot, RootContent as HastRootContent } from 'hast'; + import type { Root as MdastRoot } from 'mdast'; import { browser } from '$app/environment'; + import { onDestroy, tick } from 'svelte'; + import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer'; + import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links'; + import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks'; + import { remarkLiteralHtml } from '$lib/markdown/literal-html'; + import { copyCodeToClipboard, preprocessLaTeX } from '$lib/utils'; import '$styles/katex-custom.scss'; - import githubDarkCss from 'highlight.js/styles/github-dark.css?inline'; import githubLightCss from 'highlight.js/styles/github.css?inline'; import { mode } from 'mode-watcher'; - import { remarkLiteralHtml } from '$lib/markdown/literal-html'; import CodePreviewDialog from './CodePreviewDialog.svelte'; interface Props { @@ -23,33 +27,24 @@ class?: string; } + interface MarkdownBlock { + id: string; + html: string; + } + let { content, class: className = '' }: Props = $props(); let containerRef = $state(); - let processedHtml = $state(''); + let renderedBlocks = $state([]); + let unstableBlockHtml = $state(''); let previewDialogOpen = $state(false); let previewCode = $state(''); let previewLanguage = $state('text'); - function loadHighlightTheme(isDark: boolean) { - if (!browser) return; + let pendingMarkdown: string | null = null; + let isProcessing = false; - const existingThemes = document.querySelectorAll('style[data-highlight-theme]'); - existingThemes.forEach((style) => style.remove()); - - const style = document.createElement('style'); - style.setAttribute('data-highlight-theme', 'true'); - style.textContent = isDark ? githubDarkCss : githubLightCss; - - document.head.appendChild(style); - } - - $effect(() => { - const currentMode = mode.current; - const isDark = currentMode === 'dark'; - - loadHighlightTheme(isDark); - }); + const themeStyleId = `highlight-theme-${(window.idxThemeStyle = (window.idxThemeStyle ?? 0) + 1)}`; let processor = $derived(() => { return remark() @@ -61,139 +56,64 @@ .use(rehypeKatex) // Render math using KaTeX .use(rehypeHighlight) // Add syntax highlighting .use(rehypeRestoreTableHtml) // Restore limited HTML (e.g.,
,
    ) inside Markdown tables - .use(rehypeStringify); // Convert to HTML string + .use(rehypeEnhanceLinks) // Add target="_blank" to links + .use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions + .use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string }); - function enhanceLinks(html: string): string { - if (!html.includes('('.copy-code-btn'); + const previewButtons = containerRef.querySelectorAll('.preview-code-btn'); + + for (const button of copyButtons) { + button.removeEventListener('click', handleCopyClick); } - const tempDiv = document.createElement('div'); - tempDiv.innerHTML = html; - - // Make all links open in new tabs - const linkElements = tempDiv.querySelectorAll('a[href]'); - let mutated = false; - - for (const link of linkElements) { - const target = link.getAttribute('target'); - const rel = link.getAttribute('rel'); - - if (target !== '_blank' || rel !== 'noopener noreferrer') { - mutated = true; - } - - link.setAttribute('target', '_blank'); - link.setAttribute('rel', 'noopener noreferrer'); - } - - return mutated ? tempDiv.innerHTML : html; - } - - function enhanceCodeBlocks(html: string): string { - if (!html.includes(' - `; - - const actions = document.createElement('div'); - actions.className = 'code-block-actions'; - - actions.appendChild(copyButton); - - if (language.toLowerCase() === 'html') { - const previewButton = document.createElement('button'); - previewButton.className = 'preview-code-btn'; - previewButton.setAttribute('data-code-id', codeId); - previewButton.setAttribute('title', 'Preview code'); - previewButton.setAttribute('type', 'button'); - - previewButton.innerHTML = ` - - `; - - actions.appendChild(previewButton); - } - - header.appendChild(languageLabel); - header.appendChild(actions); - wrapper.appendChild(header); - - const clonedPre = pre.cloneNode(true) as HTMLElement; - wrapper.appendChild(clonedPre); - - pre.parentNode?.replaceChild(wrapper, pre); - } - - return mutated ? tempDiv.innerHTML : html; - } - - async function processMarkdown(text: string): Promise { - try { - let normalized = preprocessLaTeX(text); - const result = await processor().process(normalized); - const html = String(result); - const enhancedLinks = enhanceLinks(html); - - return enhanceCodeBlocks(enhancedLinks); - } catch (error) { - console.error('Markdown processing error:', error); - - // Fallback to plain text with line breaks - return text.replace(/\n/g, '
    '); + for (const button of previewButtons) { + button.removeEventListener('click', handlePreviewClick); } } + /** + * Removes this component's highlight.js theme style from the document head. + * Called on component destroy to clean up injected styles. + */ + function cleanupHighlightTheme() { + if (!browser) return; + + const existingTheme = document.getElementById(themeStyleId); + existingTheme?.remove(); + } + + /** + * Loads the appropriate highlight.js theme based on dark/light mode. + * Injects a scoped style element into the document head. + * @param isDark - Whether to load the dark theme (true) or light theme (false) + */ + function loadHighlightTheme(isDark: boolean) { + if (!browser) return; + + const existingTheme = document.getElementById(themeStyleId); + existingTheme?.remove(); + + const style = document.createElement('style'); + style.id = themeStyleId; + style.textContent = isDark ? githubDarkCss : githubLightCss; + + document.head.appendChild(style); + } + + /** + * Extracts code information from a button click target within a code block. + * @param target - The clicked button element + * @returns Object with rawCode and language, or null if extraction fails + */ function getCodeInfoFromTarget(target: HTMLElement) { const wrapper = target.closest('.code-block-wrapper'); @@ -209,12 +129,7 @@ return null; } - const rawCode = codeElement.getAttribute('data-raw-code'); - - if (rawCode === null) { - console.error('No raw code found'); - return null; - } + const rawCode = codeElement.textContent ?? ''; const languageLabel = wrapper.querySelector('.code-language'); const language = languageLabel?.textContent?.trim() || 'text'; @@ -222,6 +137,28 @@ return { rawCode, language }; } + /** + * Generates a unique identifier for a HAST node based on its position. + * Used for stable block identification during incremental rendering. + * @param node - The HAST root content node + * @param indexFallback - Fallback index if position is unavailable + * @returns Unique string identifier for the node + */ + function getHastNodeId(node: HastRootContent, indexFallback: number): string { + const position = node.position; + + if (position?.start?.offset != null && position?.end?.offset != null) { + return `hast-${position.start.offset}-${position.end.offset}`; + } + + return `${node.type}-${indexFallback}`; + } + + /** + * Handles click events on copy buttons within code blocks. + * Copies the raw code content to the clipboard. + * @param event - The click event from the copy button + */ async function handleCopyClick(event: Event) { event.preventDefault(); event.stopPropagation(); @@ -245,6 +182,25 @@ } } + /** + * Handles preview dialog open state changes. + * Clears preview content when dialog is closed. + * @param open - Whether the dialog is being opened or closed + */ + function handlePreviewDialogOpenChange(open: boolean) { + previewDialogOpen = open; + + if (!open) { + previewCode = ''; + previewLanguage = 'text'; + } + } + + /** + * Handles click events on preview buttons within HTML code blocks. + * Opens a preview dialog with the rendered HTML content. + * @param event - The click event from the preview button + */ function handlePreviewClick(event: Event) { event.preventDefault(); event.stopPropagation(); @@ -266,6 +222,61 @@ previewDialogOpen = true; } + /** + * Processes markdown content into stable and unstable HTML blocks. + * Uses incremental rendering: stable blocks are cached, unstable block is re-rendered. 
+ * @param markdown - The raw markdown string to process + */ + async function processMarkdown(markdown: string) { + if (!markdown) { + renderedBlocks = []; + unstableBlockHtml = ''; + return; + } + + const normalized = preprocessLaTeX(markdown); + const processorInstance = processor(); + const ast = processorInstance.parse(normalized) as MdastRoot; + const processedRoot = (await processorInstance.run(ast)) as HastRoot; + const processedChildren = processedRoot.children ?? []; + const stableCount = Math.max(processedChildren.length - 1, 0); + const nextBlocks: MarkdownBlock[] = []; + + for (let index = 0; index < stableCount; index++) { + const hastChild = processedChildren[index]; + const id = getHastNodeId(hastChild, index); + const existing = renderedBlocks[index]; + + if (existing && existing.id === id) { + nextBlocks.push(existing); + continue; + } + + const html = stringifyProcessedNode( + processorInstance, + processedRoot, + processedChildren[index] + ); + + nextBlocks.push({ id, html }); + } + + let unstableHtml = ''; + + if (processedChildren.length > stableCount) { + const unstableChild = processedChildren[stableCount]; + unstableHtml = stringifyProcessedNode(processorInstance, processedRoot, unstableChild); + } + + renderedBlocks = nextBlocks; + await tick(); // Force DOM sync before updating unstable HTML block + unstableBlockHtml = unstableHtml; + } + + /** + * Attaches click event listeners to copy and preview buttons in code blocks. + * Uses data-listener-bound attribute to prevent duplicate bindings. + */ function setupCodeBlockActions() { if (!containerRef) return; @@ -287,40 +298,97 @@ } } - function handlePreviewDialogOpenChange(open: boolean) { - previewDialogOpen = open; + /** + * Converts a single HAST node to an enhanced HTML string. + * Applies link and code block enhancements to the output. + * @param processorInstance - The remark/rehype processor instance + * @param processedRoot - The full processed HAST root (for context) + * @param child - The specific HAST child node to stringify + * @returns Enhanced HTML string representation of the node + */ + function stringifyProcessedNode( + processorInstance: ReturnType, + processedRoot: HastRoot, + child: unknown + ) { + const root: HastRoot = { + ...(processedRoot as HastRoot), + children: [child as never] + }; - if (!open) { - previewCode = ''; - previewLanguage = 'text'; + return processorInstance.stringify(root); + } + + /** + * Queues markdown for processing with coalescing support. + * Only processes the latest markdown when multiple updates arrive quickly. + * @param markdown - The markdown content to render + */ + async function updateRenderedBlocks(markdown: string) { + pendingMarkdown = markdown; + + if (isProcessing) { + return; + } + + isProcessing = true; + + try { + while (pendingMarkdown !== null) { + const nextMarkdown = pendingMarkdown; + pendingMarkdown = null; + + await processMarkdown(nextMarkdown); + } + } catch (error) { + console.error('Failed to process markdown:', error); + renderedBlocks = []; + unstableBlockHtml = markdown.replace(/\n/g, '
    '); + } finally { + isProcessing = false; } } $effect(() => { - if (content) { - processMarkdown(content) - .then((result) => { - processedHtml = result; - }) - .catch((error) => { - console.error('Failed to process markdown:', error); - processedHtml = content.replace(/\n/g, '
    '); - }); - } else { - processedHtml = ''; - } + const currentMode = mode.current; + const isDark = currentMode === 'dark'; + + loadHighlightTheme(isDark); }); $effect(() => { - if (containerRef && processedHtml) { + updateRenderedBlocks(content); + }); + + $effect(() => { + const hasRenderedBlocks = renderedBlocks.length > 0; + const hasUnstableBlock = Boolean(unstableBlockHtml); + + if ((hasRenderedBlocks || hasUnstableBlock) && containerRef) { setupCodeBlockActions(); } }); + + onDestroy(() => { + cleanupEventListeners(); + cleanupHighlightTheme(); + });
    - - {@html processedHtml} + {#each renderedBlocks as block (block.id)} +
    + + {@html block.html} +
    + {/each} + + {#if unstableBlockHtml} +
    + + {@html unstableBlockHtml} +
    + {/if}
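
For reference, the MarkdownContent.svelte refactor in the last hunks renders all but the final top-level Markdown block from a cache keyed by the block's source position, and only re-renders the trailing block while tokens are still streaming in. The sketch below illustrates that caching strategy only, under simplified assumptions: `splitIntoBlocks`, `renderBlock`, and `renderIncrementally` are hypothetical stand-ins, not the component's actual remark/rehype pipeline or its HAST-position ids.

```ts
// Minimal sketch of the stable/unstable block cache used for incremental
// Markdown rendering. Block splitting and rendering are deliberately
// simplistic stand-ins for the real remark/rehype processing.

interface RenderedBlock {
    id: string;   // stable identifier derived from the block's source offsets
    html: string; // rendered HTML for this block
}

// Stand-in parser: split on blank lines and derive an id from character
// offsets, mirroring the position-based ids used by the actual component.
function splitIntoBlocks(markdown: string): { id: string; source: string }[] {
    const blocks: { id: string; source: string }[] = [];
    let offset = 0;
    for (const part of markdown.split(/\n{2,}/)) {
        const start = markdown.indexOf(part, offset);
        const end = start + part.length;
        offset = end;
        if (part.trim().length > 0) {
            blocks.push({ id: `block-${start}-${end}`, source: part });
        }
    }
    return blocks;
}

// Stand-in renderer: escape the text and wrap it in a paragraph.
function renderBlock(source: string): string {
    const escaped = source.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    return `<p>${escaped}</p>`;
}

let cachedBlocks: RenderedBlock[] = [];

// Every block except the last is treated as stable and reused when its id is
// unchanged; the trailing block is always re-rendered because streamed text
// keeps appending to it.
function renderIncrementally(markdown: string): { stable: RenderedBlock[]; unstable: string } {
    const blocks = splitIntoBlocks(markdown);
    const stableCount = Math.max(blocks.length - 1, 0);
    const stable: RenderedBlock[] = [];

    for (let i = 0; i < stableCount; i++) {
        const cached = cachedBlocks[i];
        if (cached && cached.id === blocks[i].id) {
            stable.push(cached); // cache hit: skip re-rendering this block
        } else {
            stable.push({ id: blocks[i].id, html: renderBlock(blocks[i].source) });
        }
    }

    const unstable = blocks.length > stableCount ? renderBlock(blocks[stableCount].source) : '';

    cachedBlocks = stable;
    return { stable, unstable };
}

// Example: simulate a streamed update; the first paragraph is served from the cache
// on the second call, and only the still-growing trailing block is re-rendered.
console.log(renderIncrementally('First paragraph.\n\nSecond paragraph, still strea'));
console.log(renderIncrementally('First paragraph.\n\nSecond paragraph, still streaming...'));
```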