diff --git a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
deleted file mode 100644
index 08fdbf506..000000000
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR riscv64)
-set(CMAKE_SYSTEM_VERSION 1)
-
-if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
-    message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
-else()
-    set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
-    if (DEFINED ENV{RISCV_ROOT_PATH})
-        file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
-    else()
-        message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
-    endif()
-
-    set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
-    set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
-    set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
-    set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
-    set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
-    set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
-endif()
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
diff --git a/common/arg.cpp b/common/arg.cpp
index cfb65376b..d10368395 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -875,7 +875,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.pop_back();
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }
 
     /**
@@ -2612,6 +2614,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.api_prefix = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
     add_opt(common_arg(
         {"--webui"}, {"--no-webui"},
diff --git a/common/common.h b/common/common.h
index 519ffdd2f..389196606 100644
--- a/common/common.h
+++ b/common/common.h
@@ -480,8 +480,11 @@ struct common_params {
 
     std::map<std::string, std::string> default_template_kwargs;
 
+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bd16ba312..86fe0b5f1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -189,10 +189,10 @@ class ModelBase:
             return tensors
 
         prefix = "model" if not self.is_mistral_format else "consolidated"
-        part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors"))
+        part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
         is_safetensors: bool = len(part_names) > 0
         if not is_safetensors:
-            part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin"))
+            part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
 
         tensor_names_from_index: set[str] = set()
@@ -209,7 +209,8 @@ class ModelBase:
                 if weight_map is None or not isinstance(weight_map, dict):
                     raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
                 tensor_names_from_index.update(weight_map.keys())
-                part_names |= set(weight_map.values())
+                part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
+                part_names = sorted(part_dict.keys())
             else:
                 weight_map = {}
         else:
@@ -1838,7 +1839,7 @@ class MmprojModel(ModelBase):
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, name, n_dims # unused
-        if ".patch_embd.weight" in new_name:
+        if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
             return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return False
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index fc7543ac9..a9d6aee7b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -4186,13 +4186,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
+
 #endif
 
     for (; i < n; ++i) {
@@ -4237,6 +4257,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                          (const __m128i *)(x + i))),
                              16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const
__bf16*)x + i + epr, epr); + vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr); + __riscv_vse32_v_f32m4(y + i + epr, ay1, epr); + } + + // leftovers + int vl; + for (i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl); + vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl); + __riscv_vse32_v_f32m4(y + i, ay0, vl); + } #endif for (; i < n; i++) { y[i] = GGML_BF16_TO_FP32(x[i]); diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index ac8633e21..427e63245 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * sumf += (ggml_float)_mm_cvtss_f32(g); #undef LOAD -#endif +#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma) + size_t vl = __riscv_vsetvlmax_e32m4(); + // initialize accumulators to all zeroes + vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + + // calculate step size + const size_t epr = __riscv_vsetvlmax_e16m2(); + const size_t step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 + for (; i < np; i += step) { + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr); + vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr); + vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr); + vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr); + vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); + } + + // accumulate in 1 register + vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl); + + // leftovers + for (i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl); + vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl); + vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl); + } + + // reduce + vl = __riscv_vsetvlmax_e32m4(); + vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + sumf += __riscv_vfmv_f_s_f32m1_f32(redsum); + +#endif for (; i < n; ++i) { sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * GGML_BF16_TO_FP32(y[i])); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index bd80805fd..3198b33b5 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG } GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); - } - } + + #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh) + size_t vl = __riscv_vsetvlmax_e32m4(); + + // initialize accumulators to all zeroes + vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl); + + // calculate step size + const size_t epr = __riscv_vsetvlmax_e16m2(); + const 
size_t step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 along the row dimension + for (int i = 0; i < np; i += step) { + vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr); + vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr); + vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr); + vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr); + vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr); + + vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr); + vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr); + vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr); + vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr); + vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr); + } + + vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl); + vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl); + + // leftovers + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m2(n - i); + vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl); + vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl); + vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl); + + vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl); + vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl); + } + + // reduce + vl = __riscv_vsetvlmax_e32m2(); + vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), + __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl); + vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0), + __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl); + vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1( + acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + + vl = __riscv_vsetvlmax_e32m2(); + vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0), + __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl); + vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0), + __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl); + vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1( + acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0); + sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1); + #else const int np = (n & ~(GGML_F16_STEP - 1)); @@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, } np = n; #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic - const int np = n; - _Float16 hv = (_Float16)v; - for (int i = 0, avl; i < n; i += avl) { - avl = __riscv_vsetvl_e16m8(n - i); - vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl); - vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl); - vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl); - __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl); + const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); + const _Float16 scale = *(const _Float16*)(&s); + + // calculate step size + const int epr = __riscv_vsetvlmax_e16m4(); + const int step = epr * 2; + int np = (n & ~(step - 1)); + + // unroll by 2 + for (int i = 0; i < np; i += step) { + vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); + ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, 
epr); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr); + vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); + ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); } + + // leftovers + int vl; + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m4(n - i); + vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); + ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); + } + np = n; #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); @@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float svst1_f16(pg, (__fp16 *)(y + np), out); } #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh) - for (int i = 0, vl; i < n; i += vl) { - vl = __riscv_vsetvl_e16m2(n - i); - vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl); - vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl); - vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl); - vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl); - __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl); + const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v); + const _Float16 scale = *(const _Float16*)(&s); + + // calculate step size + const int epr = __riscv_vsetvlmax_e16m4(); + const int step = epr * 2; + const int np = (n & ~(step - 1)); + + // unroll by 2 + for (int i = 0; i < np; i += step) { + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr); + ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr); + __asm__ __volatile__ ("" ::: "memory"); + + vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr); + ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr); + __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr); + __asm__ __volatile__ ("" ::: "memory"); + } + + // leftovers + int vl; + for (int i = np; i < n; i += vl) { + vl = __riscv_vsetvl_e16m4(n - i); + vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); + ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl); + __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); } #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 794d90bdd..3268dadfe 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -78,27 +78,25 @@ namespace ggml_cuda_mma { // MIRRORED == Each data value is held exactly once per thread subgroup. DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA. DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3. - DATA_LAYOUT_I_MAJOR_MIRRORED = 20, + DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3. DATA_LAYOUT_J_MAJOR_MIRRORED = 30, - DATA_LAYOUT_I_MAJOR_DUAL = 40, // Matrix A&B for RDNA3. 
}; // Implemented mma combinations are: // - (I_MAJOR, I_MAJOR) -> I_MAJOR // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR - constexpr bool is_i_major(const data_layout dl) { + static constexpr bool is_i_major(const data_layout dl) { return dl == DATA_LAYOUT_I_MAJOR || - dl == DATA_LAYOUT_I_MAJOR_MIRRORED || - dl == DATA_LAYOUT_I_MAJOR_DUAL; + dl == DATA_LAYOUT_I_MAJOR_MIRRORED; } - constexpr data_layout get_input_data_layout() { -#if defined(RDNA3) - return DATA_LAYOUT_I_MAJOR_DUAL; + static constexpr __device__ data_layout get_input_data_layout() { +#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA + return DATA_LAYOUT_I_MAJOR_MIRRORED; #else return DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) +#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA } template @@ -462,11 +460,65 @@ namespace ggml_cuda_mma { } }; + template + struct tile { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; + + // RDNA3 + static constexpr int ne = I * J / 32 * 2; + + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int /*l*/) { + if constexpr (supported()) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (supported()) { + return l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + template struct tile { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; +#if defined(RDNA3) + static constexpr int ne = tile::ne; + + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile::get_i(l); + } + + static __device__ __forceinline__ int get_j(const int l) { + return tile::get_j(l); + } +#else // Volta static constexpr int ne = I * J / (WARP_SIZE/4); half2 x[ne] = {{0.0f, 0.0f}}; @@ -493,6 +545,29 @@ namespace ggml_cuda_mma { return -1; } } +#endif // defined(RDNA3) + }; + + template + struct tile { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED; + static constexpr int ne = tile::ne; + + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile::get_i(l); + } + + static __device__ __forceinline__ int get_j(const int l) { + return tile::get_j(l); + } }; template @@ -528,42 +603,6 @@ namespace ggml_cuda_mma { } }; - template - struct tile { - static constexpr int I = I_; - static constexpr int J = J_; - static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - - static constexpr int ne = I * J / 32 * 2; - - T x[ne] = {0}; - - static constexpr __device__ bool supported() { - if (I == 16 && J == 16) return true; - if (I == 16 && J == 8) return true; - if (I == 16 && J == 4) return true; - return false; - } - - static __device__ __forceinline__ int get_i(const int l) { - if constexpr (supported()) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } - } - - static __device__ __forceinline__ int get_j(const int l) { - if constexpr 
(supported()) { - return l; - } else { - NO_DEVICE_CODE; - return -1; - } - } - }; - #if defined(TURING_MMA_AVAILABLE) template static __device__ __forceinline__ tile get_half2(const tile & tile_float) { diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 7907e706d..4918ae971 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -288,7 +288,7 @@ class LocalTensor: data_range: LocalTensorRange def mmap_bytes(self) -> np.ndarray: - return np.memmap(self.data_range.filename, mode='r', offset=self.data_range.offset, shape=self.data_range.size) + return np.memmap(self.data_range.filename, mode='c', offset=self.data_range.offset, shape=self.data_range.size) class SafetensorsLocal: diff --git a/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja b/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja deleted file mode 100644 index a01e0861c..000000000 --- a/models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja +++ /dev/null @@ -1,204 +0,0 @@ -{% macro render_extra_keys(json_dict, handled_keys) %} - {%- if json_dict is mapping %} - {%- for json_key in json_dict if json_key not in handled_keys %} - {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %} - {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '' }} - {%- else %} - {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '' }} - {%- endif %} - {%- endfor %} - {%- endif %} -{% endmacro %} -{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %} -{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %} - -{%- set ns = namespace(last_user_idx = -1) %} -{%- set loop_messages = messages %} -{%- for m in loop_messages %} - {%- if m["role"] == "user" %} - {%- set ns.last_user_idx = loop.index0 %} - {%- endif %} -{%- endfor %} - -{%- if messages[0]["role"] == "system" %} - {%- set system_message = messages[0]["content"] %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set system_message = "" %} - {%- set loop_messages = messages %} -{%- endif %} -{%- if not tools is defined %} - {%- set tools = [] %} -{%- endif %} -{# Recompute last_user_idx relative to loop_messages after handling system #} -{%- set ns = namespace(last_user_idx = -1) %} -{%- for m in loop_messages %} - {%- if m["role"] == "user" %} - {%- set ns.last_user_idx = loop.index0 %} - {%- endif %} -{%- endfor %} -{%- if system_message is defined %} - {{- "<|im_start|>system\n" + system_message }} -{%- else %} - {%- if tools is iterable and tools | length > 0 %} - {{- "<|im_start|>system\n" }} - {%- endif %} -{%- endif %} -{%- if tools is iterable and tools | length > 0 %} - {%- if system_message is defined and system_message | length > 0 %} - {{- "\n\n" }} - {%- endif %} - {{- "# Tools\n\nYou have access to the following functions:\n\n" }} - {{- "" }} - {%- for tool in tools %} - {%- if tool.function is defined %} - {%- set tool = tool.function %} - {%- endif %} - {{- "\n\n" ~ tool.name ~ "" }} - {%- if tool.description is defined %} - {{- '\n' ~ (tool.description | trim) ~ '' }} - {%- endif %} - {{- '\n' }} - {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} - {%- for param_name, param_fields in tool.parameters.properties|items %} - {{- '\n' }} - {{- '\n' ~ param_name ~ '' }} - {%- if param_fields.type is defined %} - {{- '\n' ~ (param_fields.type | 
string) ~ '' }} - {%- endif %} - {%- if param_fields.description is defined %} - {{- '\n' ~ (param_fields.description | trim) ~ '' }} - {%- endif %} - {%- if param_fields.enum is defined %} - {{- '\n' ~ (param_fields.enum | tojson | safe) ~ '' }} - {%- endif %} - {%- set handled_keys = ['name', 'type', 'description', 'enum'] %} - {{- render_extra_keys(param_fields, handled_keys) }} - {{- '\n' }} - {%- endfor %} - {%- endif %} - {% set handled_keys = ['type', 'properties', 'required'] %} - {{- render_extra_keys(tool.parameters, handled_keys) }} - {%- if tool.parameters is defined and tool.parameters.required is defined %} - {{- '\n' ~ (tool.parameters.required | tojson | safe) ~ '' }} - {%- endif %} - {{- '\n' }} - {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} - {{- render_extra_keys(tool, handled_keys) }} - {{- '\n' }} - {%- endfor %} - {{- "\n" }} - - {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} -{%- endif %} - - -{%- if system_message is defined %} - {{- '<|im_end|>\n' }} -{%- else %} - {%- if tools is iterable and tools | length > 0 %} - {{- '<|im_end|>\n' }} - {%- endif %} -{%- endif %} - -{%- for message in loop_messages %} - {%- if message.role == "assistant" %} - {# Add reasoning content in to content field for unified processing below. #} - {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %} - {%- set content = "\n" ~ message.reasoning_content ~ "\n\n" ~ (message.content | default('', true)) %} - {%- else %} - {%- set content = message.content | default('', true) %} - {%- if content is string -%} - {# Allow downstream logic to to take care of broken thought, only handle coherent reasoning here. #} - {%- if '' not in content and '' not in content -%} - {%- set content = "" ~ content -%} - {%- endif -%} - {%- else -%} - {%- set content = content -%} - {%- endif -%} - {%- endif %} - {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} - {# Assistant message has tool calls. #} - {{- '<|im_start|>assistant\n' }} - {%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} - {%- if content is string and content | trim | length > 0 %} - {%- if include_content %} - {{- (content | trim) ~ '\n' -}} - {%- else %} - {%- set c = (content | string) %} - {%- if '' in c %} - {# Keep only content after the last closing think. Also generation prompt causes this. 
#} - {%- set c = c.split('')[-1] %} - {%- elif '' in c %} - {# If was opened but never closed, drop the trailing think segment #} - {%- set c = c.split('')[0] %} - {%- endif %} - {%- set c = "" ~ c | trim %} - {%- if c | length > 0 %} - {{- c ~ '\n' -}} - {%- endif %} - {%- endif %} - {%- else %} - {{- "" -}} - {%- endif %} - {%- for tool_call in message.tool_calls %} - {%- if tool_call.function is defined %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n\n' -}} - {%- if tool_call.arguments is defined %} - {%- for args_name, args_value in tool_call.arguments|items %} - {{- '\n' -}} - {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} - {{- args_value ~ '\n\n' -}} - {%- endfor %} - {%- endif %} - {{- '\n\n' -}} - {%- endfor %} - {{- '<|im_end|>\n' }} - {%- else %} - {# Assistant message doesn't have tool calls. #} - {%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %} - {{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }} - {%- else %} - {%- set c = (content | default('', true) | string) %} - {%- if '' in c and '' in c %} - {%- set c = "" ~ c.split('')[-1] %} - {%- endif %} - {%- set c = c | trim %} - {%- if c | length > 0 %} - {{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }} - {%- else %} - {{- '<|im_start|>assistant\n<|im_end|>\n' }} - {%- endif %} - {%- endif %} - {%- endif %} - {%- elif message.role == "user" or message.role == "system" %} - {{- '<|im_start|>' + message.role + '\n' }} - {%- set content = message.content | string %} - {{- content }} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.previtem and loop.previtem.role != "tool" %} - {{- '<|im_start|>user\n' }} - {%- endif %} - {{- '\n' }} - {{- message.content }} - {{- '\n\n' }} - {%- if not loop.last and loop.nextitem.role != "tool" %} - {{- '<|im_end|>\n' }} - {%- elif loop.last %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- else %} - {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} - {%- endif %} -{%- endfor %} - -{%- if add_generation_prompt %} - {%- if enable_thinking %} - {{- '<|im_start|>assistant\n\n' }} - {%- else %} - {{- '<|im_start|>assistant\n' }} - {%- endif %} -{%- endif %} diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py deleted file mode 100644 index 63861dd9a..000000000 --- a/scripts/compare-logprobs.py +++ /dev/null @@ -1,281 +0,0 @@ -import argparse -import requests -import json -from pathlib import Path -import logging - -logger = logging.getLogger("compare-logprobs") -logging.basicConfig(level=logging.INFO) - - -DESCRIPTION = """ -Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints. - -Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally. 
- -Example usage: - Step 1: Dump logits from two different servers - python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions - python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions - - (optionally, you can add --api-key if the endpoint requires authentication) - - Step 2: Compare the dumped logits - python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md -""" - - -def generate_input_prompt(length: int) -> list[str]: - CORPUS = """ - You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls. - - ### Tool Call Format: - When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text. - - You can make multiple calls in one go by placing them one after another. - """ - words = [w.strip() for w in CORPUS.strip().split(" ")] - words = [w for w in words if len(w) > 0] # filter out empty strings - while len(words) < length: - words += words - return words[:length] - - -def dump_logits( - endpoint: str, - output_path: Path, - input_words: list[str], - pattern: list[tuple[bool, int]], - api_key=None, -): - logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...") - words = input_words - curr_text = "" - n_total = sum(n for get, n in pattern if get) - n_done = 0 - i_cur = 0 - i_total = len(words) - with output_path.open("w") as f: - for get, n in pattern: - if not get: - # skip n words - for i in range(n): - curr_text += words.pop(0) + " " - i_cur += 1 - continue - # get n words - for i in range(n): - curr_text += words.pop(0) + " " - payload = { - "prompt": curr_text.strip(), - "temperature": 0.0, - "top_k": 1, - "max_tokens": 1, - "logprobs": 1, - "stream": False, - } - response = requests.post( - endpoint, - json=payload, - headers={"Authorization": f"Bearer {api_key}"} if api_key else {}, - ) - response.raise_for_status() - data = response.json() - data["__index"] = i_cur # add index for easier debugging later - data = json.dumps(data) - f.write(f"{data}\n") - n_done += 1 - i_cur += 1 - logger.info( - f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]" - ) - logger.info(f"Logits dumped to {output_path}") - - -def get_token_logprobs(data: dict): - logprobs = data["choices"][0]["logprobs"] - if "content" in logprobs: - # llama.cpp case - top = logprobs["content"][0]["top_logprobs"][0] - return top["token"], top["logprob"] - else: - # vllm case - tokens = logprobs["tokens"] - token_logprobs = logprobs["token_logprobs"] - return tokens[0], token_logprobs[0] - - -def clean_text(text: str) -> str: - return ( - "'" - + text.replace("\n", "\\n") - .replace("\t", "\\t") - .replace("\r", "\\r") - .replace("|", "\\|") - + "'" - ) - - -def compare_logits(input1: Path, input2: Path, output_path: Path): - with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout: - lines1 = f1.readlines() - lines2 = f2.readlines() - - tab_header = [ - "idx", - input1.name, - "logprob_1", - input2.name, - "logprob_2", - "diff (abs)", - ] - tab_entries = [] - tab_max_widths = [len(h) for h in tab_header] - - assert len(lines1) == len( - lines2 - ), "Input files must have the same number of lines." 
- - fout.write("# Logits Comparison Report\n\n") - for i, (line1, line2) in enumerate(zip(lines1, lines2)): - if not line1.strip() or not line2.strip(): - continue # skip empty lines - - data1 = json.loads(line1) - data2 = json.loads(line2) - - idx1 = data1.get("__index", -1) - idx2 = data2.get("__index", -1) - if idx1 != idx2: - logger.warning( - f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}" - ) - - token1, logprob1 = get_token_logprobs(data1) - token2, logprob2 = get_token_logprobs(data2) - - token1 = clean_text(token1) - token2 = clean_text(token2) - abs_diff = abs(logprob1 - logprob2) - - tab_entries.append( - ( - str(idx1 + 1), - token1, - f"{logprob1:.4f}", - token2, - f"{logprob2:.4f}", - f"{(abs_diff):.4f}", - ) - ) - - for i in range(len(tab_entries)): - for j in range(len(tab_header)): - tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j])) - - output = "" - for j in range(len(tab_header)): - output += f"| {tab_header[j]:<{tab_max_widths[j]}} " - output += "|\n" - for j in range(len(tab_header)): - output += f"|{'-' * (tab_max_widths[j] + 2)}" - output += "|\n" - for entry in tab_entries: - for j in range(len(tab_header)): - output += f"| {entry[j]:<{tab_max_widths[j]}} " - output += "|\n" - - logger.info("\n" + output) - fout.write(output) - logger.info(f"Report written to {output_path}") - - -def parse_pattern(pattern: str) -> list[tuple[bool, int]]: - parts = pattern.split(",") - result = [] - for i, part in enumerate(parts): - n = int(part) - if i % 2 == 0: - result.append((True, n)) # get n words - else: - result.append((False, n)) # skip n words - return result - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter - ) - subparsers = parser.add_subparsers( - dest="verb", required=True, help="action to perform" - ) - - # dump subcommand - parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint") - parser_dump.add_argument( - "output", type=Path, help="output path for dumped logits (.log)" - ) - parser_dump.add_argument( - "endpoint", type=str, help="OAI-compat /completions endpoint" - ) - parser_dump.add_argument( - "--api-key", - type=str, - default=None, - help="API key for authentication (if required)", - ) - parser_dump.add_argument( - "--file", - type=Path, - default=None, - help="File containing prompt to use instead of the default", - ) - parser_dump.add_argument( - "--pattern", - type=str, - default="10,1000,10,4000,10", - help="Pattern n_get,n_skip,... 
where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)", - ) - - # compare subcommand - parser_compare = subparsers.add_parser( - "compare", help="compare two dumped logits files" - ) - parser_compare.add_argument("input1", type=Path, help="first input file (.log)") - parser_compare.add_argument("input2", type=Path, help="second input file (.log)") - parser_compare.add_argument( - "output", type=Path, help="output path for comparison report (.md)" - ) - - try: - return parser.parse_args() - except Exception as e: - parser.print_help() - raise e - - -def main(): - args = parse_args() - - if args.verb == "dump": - pattern = parse_pattern(args.pattern) - input_length = sum(n for _, n in pattern) - input_words = generate_input_prompt(input_length) - if args.file is not None: - with args.file.open("r") as f: - input_words = f.read().strip().split(" ") - if input_length < sum(n for _, n in pattern): - raise ValueError( - f"Input file has only {input_length} words, but pattern requires at least {input_length} words." - ) - input_length = len(input_words) - logger.info(f"Using {input_length} words") - dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key) - elif args.verb == "compare": - compare_logits(args.input1, args.input2, args.output) - else: - raise ValueError(f"Unknown verb: {args.verb}") - - -if __name__ == "__main__": - main() diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh deleted file mode 100755 index 91d868278..000000000 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/sh -# - -# Basedir on device -basedir=/data/local/tmp/llama.cpp - -cli_opts= - -branch=. -[ "$B" != "" ] && branch=$B - -adbserial= -[ "$S" != "" ] && adbserial="-s $S" - -model="gemma-3-4b-it-Q4_0.gguf" -[ "$M" != "" ] && model="$M" - -mmproj="mmproj-F16.gguf" -[ "$MMPROJ" != "" ] && mmproj="$MMPROJ" - -image= -[ "$IMG" != "" ] && image="$IMG" - -device="HTP0" -[ "$D" != "" ] && device="$D" - -verbose= -[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" - -experimental="GGML_HEXAGON_EXPERIMENTAL=1" -[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" - -sched= -[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" - -profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" - -opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" - -nhvx= -[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" - -ndev= -[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" - -# MTMD backend device for vision model (defaults to CPU if not set) -mtmd_backend= -[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE" - -set -x - -adb $adbserial shell " \ - cd $basedir; ulimit -c unlimited; \ - LD_LIBRARY_PATH=$basedir/$branch/lib \ - ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ - ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ - --mmproj $basedir/../gguf/$mmproj \ - --image $basedir/../gguf/$image \ - --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ - -ngl 99 --device $device -v $cli_opts $@ \ -" diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 339917b24..d7e019bde 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -13,9 +13,10 @@ #ifdef __has_include #if __has_include() #include + #include + #include #if defined(_POSIX_MAPPED_FILES) 
#include - #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -74,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -153,13 +154,40 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + throw std::runtime_error("DirectIO is not implemented on Windows."); + } + ~impl() { if (fp) { std::fclose(fp); } } #else - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { +#ifdef __linux__ + // Try unbuffered I/O for read only + if (use_direct_io && std::strcmp(mode, "rb") == 0) { + fd = open(fname, O_RDONLY | O_DIRECT); + + if (fd != -1) { + struct stat file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + alignment = file_stats.st_blksize; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return; + } + + LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O", + fname, strerror(errno)); + } +#endif fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -170,27 +198,30 @@ struct llama_file::impl { } size_t tell() const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - if (ret == -1) { - throw std::runtime_error(format("ftell error: %s", strerror(errno))); + if (fd == -1) { + long ret = std::ftell(fp); + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + + return (size_t) ret; } - return (size_t) ret; + off_t pos = lseek(fd, 0, SEEK_CUR); + if (pos == -1) { + throw std::runtime_error(format("lseek error: %s", strerror(errno))); + } + return (size_t) pos; } void seek(size_t offset, int whence) const { -// TODO: this ifdef is never true? 
-#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - if (ret != 0) { + off_t ret = 0; + if (fd == -1) { + ret = std::fseek(fp, (long) offset, whence); + } else { + ret = lseek(fd, offset, whence); + } + if (ret == -1) { throw std::runtime_error(format("seek error: %s", strerror(errno))); } } @@ -200,13 +231,55 @@ struct llama_file::impl { return; } errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + if (fd == -1) { + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } else { + bool successful = false; + while (!successful) { + off_t ret = read(fd, ptr, len); + + if (ret == -1) { + if (errno == EINTR) { + continue; // Interrupted by signal, retry + } + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + successful = true; + } } - if (ret != 1) { - throw std::runtime_error("unexpectedly reached end of file"); + } + + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); + + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } + + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); + + seek(aligned_offset, SEEK_SET); + read_raw(buffer.get(), bytes_to_read); + + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); } uint32_t read_u32() const { @@ -231,22 +304,43 @@ struct llama_file::impl { } ~impl() { - if (fp) { + if (fd != -1) { + close(fd); + } else { std::fclose(fp); } } + int fd = -1; #endif - FILE * fp; - size_t size; + void read_raw_at(void * ptr, size_t len, size_t offset) const { + if (alignment != 1) { + read_aligned_chunk(offset, ptr, len); + } else { + seek(offset, SEEK_SET); + read_raw(ptr, len); + } + } + + size_t read_alignment() const { + return alignment; + } + + size_t alignment = 1; + + FILE * fp{}; + size_t size{}; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : + pimpl(std::make_unique(fname, mode, use_direct_io)) {} llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } + int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); @@ -261,6 +355,7 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); 
} uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f4..729aac164 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,7 @@ #include #include #include +#include struct llama_file; struct llama_mmap; @@ -13,7 +14,7 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); + llama_file(const char * fname, const char * mode, bool use_direct_io = false); ~llama_file(); size_t tell() const; @@ -24,11 +25,14 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; + void read_raw_at(void * ptr, size_t len, size_t offset) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; + size_t read_alignment() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 2a0e16bba..966e82d00 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -508,7 +508,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); + files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -576,7 +576,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb")); + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -958,7 +958,15 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + + size_t alignment = 1; + for (const auto & file : files) { + alignment = std::max(file->read_alignment(), alignment); + } + + // Buffer size: balance between memory usage and I/O efficiency + // 64MB works well for NVMe drives + const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -1008,6 +1016,7 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. 
for (size_t idx = 0; idx < n_buffers; ++idx) { auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1089,9 +1098,9 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); + if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); + file->read_raw_at(cur->data, n_size, weight->offs); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1100,26 +1109,60 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { - file->seek(weight->offs, SEEK_SET); + auto offset = (off_t) weight->offs; + alignment = file->read_alignment(); + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); + + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + + // Wait for previous upload to complete before reusing buffer ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; + + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } + + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } + + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); ggml_backend_event_record(events[buffer_idx], upload_backend); - bytes_read += read_iteration; + data_read += data_to_copy; + bytes_read += read_size; + ++buffer_idx; buffer_idx %= n_buffers; } } else { read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); + file->read_raw_at(read_buf.data(), n_size, weight->offs); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8cd222b61..abd76900d 100644 --- 
a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2480,7 +2480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); + int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); #if defined(GGML_USE_CLBLAST) printf("\nOpenCL GPU Offload Fallback...\n"); @@ -2491,9 +2491,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (cpu_dev == nullptr) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); } - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); + const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -6852,10 +6852,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (llama_supports_gpu_offload()) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { + int n_repeating = n_gpu; + if (n_repeating > 0) { LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); + n_repeating--; } + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; diff --git a/src/llama.cpp b/src/llama.cpp index 4502fd42f..9be51e42c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -316,10 +316,6 @@ static void llama_params_fit_impl( if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); } - if (hp_ngl < 2*nd) { - throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least " - + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort"); - } } if (!tensor_buft_overrides) { throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort"); @@ -386,8 +382,7 @@ static void llama_params_fit_impl( auto set_ngl_tensor_split_tbo = [&]( const std::vector & ngl_per_device, const std::vector & overflow_bufts, - llama_model_params & mparams, - const bool add_nonrepeating) { + llama_model_params & mparams) { mparams.n_gpu_layers = 0; for (size_t id = 0; id < nd; id++) { mparams.n_gpu_layers += ngl_per_device[id].n_layer; @@ -395,13 +390,9 @@ static void llama_params_fit_impl( tensor_split[id] = ngl_per_device[id].n_layer; } } - assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl); - uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides + assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); + uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides - if (add_nonrepeating) { - mparams.n_gpu_layers += 1; - tensor_split[nd - 1] += 1; - } mparams.tensor_split = tensor_split; size_t itbo = 0; @@ -432,10 +423,9 @@ static void llama_params_fit_impl( 
auto get_memory_for_layers = [&]( const char * func_name, const std::vector & ngl_per_device, - const std::vector & overflow_bufts, - const bool add_nonrepeating) -> std::vector { + const std::vector & overflow_bufts) -> std::vector { llama_model_params mparams_copy = *mparams; - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); const dmds_t dmd_nl = llama_get_device_memory_data( path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); @@ -493,9 +483,6 @@ static void llama_params_fit_impl( LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); } - // whether for the optimal memory use we expect to load at least some MoE tensors: - const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0; - std::vector overflow_bufts; // which bufts the partial layers of a device overflow to: overflow_bufts.reserve(nd); for (size_t id = 0; id < nd - 1; ++id) { @@ -504,7 +491,7 @@ static void llama_params_fit_impl( overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); std::vector ngl_per_device(nd); - std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe); + std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); if (hp_nex > 0) { for (size_t id = 0; id < nd; id++) { ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE; @@ -517,13 +504,14 @@ static void llama_params_fit_impl( // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target // - check memory use of our guess, replace either the low or high bound // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits + // - the last device has the output layer, which cannot be a partial layer if (hp_nex == 0) { LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); } else { LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); } for (int id = nd - 1; id >= 0; id--) { - uint32_t n_unassigned = hp_ngl; + uint32_t n_unassigned = hp_ngl + 1; for (size_t jd = id + 1; jd < nd; ++jd) { assert(n_unassigned >= ngl_per_device[jd].n_layer); n_unassigned -= ngl_per_device[jd].n_layer; @@ -532,10 +520,10 @@ static void llama_params_fit_impl( std::vector ngl_per_device_high = ngl_per_device; ngl_per_device_high[id].n_layer = n_unassigned; if (hp_nex > 0) { - ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer; + ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? 
ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; } if (ngl_per_device_high[id].n_layer > 0) { - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; @@ -550,7 +538,7 @@ static void llama_params_fit_impl( if (hp_nex) { ngl_per_device_test[id].n_part += step_size; } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -577,7 +565,7 @@ static void llama_params_fit_impl( __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); } if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); return; } @@ -600,13 +588,13 @@ static void llama_params_fit_impl( for (size_t id = 0; id <= id_dense_start; id++) { std::vector ngl_per_device_high = ngl_per_device; for (size_t jd = id_dense_start; jd < nd; jd++) { - const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer; + const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; ngl_per_device_high[id].n_layer += n_layer_move; ngl_per_device_high[jd].n_layer -= n_layer_move; ngl_per_device_high[jd].n_part = 0; } size_t id_dense_start_high = nd - 1; - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part); @@ -634,7 +622,7 @@ static void llama_params_fit_impl( break; } } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -661,7 +649,7 @@ static void llama_params_fit_impl( } // try to fit at least part of one more layer - if (ngl_per_device[id_dense_start].n_layer > 0) { + if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 
0 : 1)) { std::vector ngl_per_device_test = ngl_per_device; size_t id_dense_start_test = id_dense_start; ngl_per_device_test[id_dense_start_test].n_layer--; @@ -673,7 +661,7 @@ static void llama_params_fit_impl( } ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); - std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -683,7 +671,7 @@ static void llama_params_fit_impl( ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -694,7 +682,7 @@ static void llama_params_fit_impl( } else { ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] < targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; @@ -711,7 +699,7 @@ static void llama_params_fit_impl( __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); } - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); } bool llama_params_fit( diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp deleted file mode 100644 index 481b39d04..000000000 --- a/tests/test-state-restore-fragmented.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Test for state restore with fragmented KV cache -// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527 -// The issue was that state restore required contiguous KV cache slots, -// which fails when the cache is fragmented. -// -// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false) -// in state_read_meta(), allowing non-contiguous slot allocation. - -#include "arg.h" -#include "common.h" -#include "llama.h" - -#include -#include -#include - -int main(int argc, char ** argv) { - common_params params; - - params.sampling.seed = 1234; - params.kv_unified = true; - params.n_parallel = 3; - params.n_ctx = 256; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { - return 1; - } - - common_init(); - - // init - common_init_result_ptr llama_init = common_init_from_params(params); - - llama_model * model = llama_init->model(); - llama_context * ctx = llama_init->context(); - - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - GGML_UNUSED(model); - - // tokenize prompt - std::vector tokens(70, 1); - - // interleave the 3 sequences: - // 01201230123... 
- llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1); - for (size_t i = 0; i < tokens.size(); i++) { - for (int s = 0; s < params.n_parallel; ++s) { - common_batch_add(batch, tokens[i], i, {s}, false); - } - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 0\n", __func__); - return 1; - } - - fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size()); - - // Save state of seq 1 - std::vector seq_state(llama_state_seq_get_size(ctx, 1)); - const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1); - if (ncopy != seq_state.size()) { - fprintf(stderr, "%s : failed to save seq 1 state\n", __func__); - return 1; - } - fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy); - - // clear seq 1 to create a "hole" in the KV cache (fragmentation) - // 0.20.20.20.2.... - llama_memory_t mem = llama_get_memory(ctx); - llama_memory_seq_rm(mem, 1, -1, -1); - fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__); - - // Now the cache has holes where seq 1 was - // This creates fragmentation - there's no contiguous block large enough - // for the seq 1 state if we only look for contiguous slots - - // Restore seq 1 state into seq 1 (should work with non-contiguous allocation) - // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1) - // Before the fix, this would fail with "failed to find available cells in kv cache" - const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1); - if (nset != seq_state.size()) { - fprintf(stderr, "%s : FAILED to restore seq state into fragmented cache (got %zu, expected %zu)\n", - __func__, nset, seq_state.size()); - fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__); - llama_batch_free(batch); - return 1; - } - fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); - - // Verify we can decode with the restored state - // Generate one token to verify the restored state is usable - auto sparams = llama_sampler_chain_default_params(); - llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); - - common_batch_clear(batch); - common_batch_add(batch, next_token, (int)tokens.size(), {1}, true); - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode with restored state\n", __func__); - llama_sampler_free(smpl); - llama_batch_free(batch); - return 1; - } - - fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str()); - fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__); - - llama_sampler_free(smpl); - llama_batch_free(batch); - - return 0; -} diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt deleted file mode 100644 index b08fff4c2..000000000 --- a/tools/cli/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -include_directories(../server) - -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) -endif() diff --git 
a/tools/cli/README.md b/tools/cli/README.md deleted file mode 100644 index 1333ed77b..000000000 --- a/tools/cli/README.md +++ /dev/null @@ -1 +0,0 @@ -TODO diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp deleted file mode 100644 index 8a8639207..000000000 --- a/tools/cli/cli.cpp +++ /dev/null @@ -1,395 +0,0 @@ -#include "common.h" -#include "arg.h" -#include "console.h" -// #include "log.h" - -#include "server-context.h" -#include "server-task.h" - -#include -#include -#include -#include - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#endif - -const char * LLAMA_ASCII_LOGO = R"( -▄▄ ▄▄ -██ ██ -██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ -██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ -██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ - ██ ██ - ▀▀ ▀▀ -)"; - -static std::atomic g_is_interrupted = false; -static bool should_stop() { - return g_is_interrupted.load(); -} - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -static void signal_handler(int) { - if (g_is_interrupted.load()) { - // second Ctrl+C - exit immediately - // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) - fprintf(stdout, "\033[0m\n"); - fflush(stdout); - std::exit(130); - } - g_is_interrupted.store(true); -} -#endif - -struct cli_context { - server_context ctx_server; - json messages = json::array(); - std::vector input_files; - task_params defaults; - - // thread for showing "loading" animation - std::atomic loading_show; - - cli_context(const common_params & params) { - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; - - defaults.stream = true; // make sure we always use streaming mode - defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way - // defaults.return_progress = true; // TODO: show progress - defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - } - - std::string generate_completion(result_timings & out_timings) { - server_response_reader rd = ctx_server.get_response_reader(); - { - // TODO: reduce some copies here in the future - server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_input = messages; // copy - task.cli_files = input_files; // copy - rd.post_task({std::move(task)}); - } - - // wait for first result - console::spinner::start(); - server_task_result_ptr result = rd.next(should_stop); - - console::spinner::stop(); - std::string curr_content; - bool is_thinking = false; - - while (result) { - if (should_stop()) { - break; - } - if (result->is_error()) { - json err_data = result->to_json(); - if (err_data.contains("message")) { - console::error("Error: %s\n", err_data["message"].get().c_str()); - } else { - console::error("Error: %s\n", err_data.dump().c_str()); - } - return curr_content; - } - auto res_partial = dynamic_cast(result.get()); - if (res_partial) { - out_timings = std::move(res_partial->timings); - for (const auto & diff : res_partial->oaicompat_msg_diffs) { - if (!diff.content_delta.empty()) { - if (is_thinking) { - console::log("\n[End thinking]\n\n"); - console::set_display(DISPLAY_TYPE_RESET); - is_thinking = false; - } - curr_content += diff.content_delta; - console::log("%s", diff.content_delta.c_str()); - 
console::flush(); - } - if (!diff.reasoning_content_delta.empty()) { - console::set_display(DISPLAY_TYPE_REASONING); - if (!is_thinking) { - console::log("[Start thinking]\n"); - } - is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); - console::flush(); - } - } - } - auto res_final = dynamic_cast(result.get()); - if (res_final) { - out_timings = std::move(res_final->timings); - break; - } - result = rd.next(should_stop); - } - g_is_interrupted.store(false); - // server_response_reader automatically cancels pending tasks upon destruction - return curr_content; - } - - // TODO: support remote files in the future (http, https, etc) - std::string load_input_file(const std::string & fname, bool is_media) { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return ""; - } - if (is_media) { - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - input_files.push_back(std::move(buf)); - return mtmd_default_marker(); - } else { - std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - return content; - } - } -}; - -int main(int argc, char ** argv) { - common_params params; - - params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { - return 1; - } - - // TODO: maybe support it later? - if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { - console::error("--no-conversation is not supported by llama-cli\n"); - console::error("please use llama-completion instead\n"); - } - - common_init(); - - // struct that contains llama context and inference - cli_context ctx_cli(params); - - llama_backend_init(); - llama_numa_init(params.numa); - - // TODO: avoid using atexit() here by making `console` a singleton - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - console::set_display(DISPLAY_TYPE_RESET); - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = signal_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); - sigaction(SIGTERM, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif - - console::log("\nLoading model... 
"); // followed by loading animation - console::spinner::start(); - if (!ctx_cli.ctx_server.load_model(params)) { - console::spinner::stop(); - console::error("\nFailed to load the model\n"); - return 1; - } - - ctx_cli.ctx_server.init(); - - console::spinner::stop(); - console::log("\n"); - - std::thread inference_thread([&ctx_cli]() { - ctx_cli.ctx_server.start_loop(); - }); - - auto inf = ctx_cli.ctx_server.get_info(); - std::string modalities = "text"; - if (inf.has_inp_image) { - modalities += ", vision"; - } - if (inf.has_inp_audio) { - modalities += ", audio"; - } - - if (!params.system_prompt.empty()) { - ctx_cli.messages.push_back({ - {"role", "system"}, - {"content", params.system_prompt} - }); - } - - console::log("\n"); - console::log("%s\n", LLAMA_ASCII_LOGO); - console::log("build : %s\n", inf.build_info.c_str()); - console::log("model : %s\n", inf.model_name.c_str()); - console::log("modalities : %s\n", modalities.c_str()); - if (!params.system_prompt.empty()) { - console::log("using custom system prompt\n"); - } - console::log("\n"); - console::log("available commands:\n"); - console::log(" /exit or Ctrl+C stop or exit\n"); - console::log(" /regen regenerate the last response\n"); - console::log(" /clear clear the chat history\n"); - console::log(" /read add a text file\n"); - if (inf.has_inp_image) { - console::log(" /image add an image file\n"); - } - if (inf.has_inp_audio) { - console::log(" /audio add an audio file\n"); - } - console::log("\n"); - - // interactive loop - std::string cur_msg; - while (true) { - std::string buffer; - console::set_display(DISPLAY_TYPE_USER_INPUT); - if (params.prompt.empty()) { - console::log("\n> "); - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - } else { - // process input prompt from args - for (auto & fname : params.image) { - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - break; - } - console::log("Loaded media from '%s'\n", fname.c_str()); - cur_msg += marker; - } - buffer = params.prompt; - if (buffer.size() > 500) { - console::log("\n> %s ... 
(truncated)\n", buffer.substr(0, 500).c_str()); - } else { - console::log("\n> %s\n", buffer.c_str()); - } - params.prompt.clear(); // only use it once - } - console::set_display(DISPLAY_TYPE_RESET); - console::log("\n"); - - if (should_stop()) { - g_is_interrupted.store(false); - break; - } - - // remove trailing newline - if (!buffer.empty() &&buffer.back() == '\n') { - buffer.pop_back(); - } - - // skip empty messages - if (buffer.empty()) { - continue; - } - - bool add_user_msg = true; - - // process commands - if (string_starts_with(buffer, "/exit")) { - break; - } else if (string_starts_with(buffer, "/regen")) { - if (ctx_cli.messages.size() >= 2) { - size_t last_idx = ctx_cli.messages.size() - 1; - ctx_cli.messages.erase(last_idx); - add_user_msg = false; - } else { - console::error("No message to regenerate.\n"); - continue; - } - } else if (string_starts_with(buffer, "/clear")) { - ctx_cli.messages.clear(); - ctx_cli.input_files.clear(); - console::log("Chat history cleared.\n"); - continue; - } else if ( - (string_starts_with(buffer, "/image ") && inf.has_inp_image) || - (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { - // just in case (bad copy-paste for example), we strip all trailing/leading spaces - std::string fname = string_strip(buffer.substr(7)); - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - continue; - } - cur_msg += marker; - console::log("Loaded media from '%s'\n", fname.c_str()); - continue; - } else if (string_starts_with(buffer, "/read ")) { - std::string fname = string_strip(buffer.substr(6)); - std::string marker = ctx_cli.load_input_file(fname, false); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - continue; - } - cur_msg += marker; - console::log("Loaded text from '%s'\n", fname.c_str()); - continue; - } else { - // not a command - cur_msg += buffer; - } - - // generate response - if (add_user_msg) { - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", cur_msg} - }); - cur_msg.clear(); - } - result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); - ctx_cli.messages.push_back({ - {"role", "assistant"}, - {"content", assistant_content} - }); - console::log("\n"); - - if (params.show_timings) { - console::set_display(DISPLAY_TYPE_INFO); - console::log("\n"); - console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); - console::set_display(DISPLAY_TYPE_RESET); - } - - if (params.single_turn) { - break; - } - } - - console::set_display(DISPLAY_TYPE_RESET); - - console::log("\nExiting...\n"); - ctx_cli.ctx_server.terminate(); - inference_thread.join(); - - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); - - return 0; -} diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index d6cc23ebf..9e44f0326 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 90898b5ec..def57d025 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -544,6 +544,8 @@ struct server_context_impl { server_metrics metrics; + json webui_settings = json::object(); + // Necessary 
similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -575,6 +577,16 @@ struct server_context_impl { params_base = params; + webui_settings = json::object(); + if (!params_base.webui_config_json.empty()) { + try { + webui_settings = json::parse(params_base.webui_config_json); + } catch (const std::exception & e) { + SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + return false; + } + } + llama_init = common_init_from_params(params_base); model = llama_init->model(); @@ -3103,7 +3115,6 @@ void server_routes::init_routes() { }; } - // this endpoint is publicly available, please only return what is safe to be exposed json data = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, @@ -3117,6 +3128,7 @@ void server_routes::init_routes() { { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, + { "webui_settings", ctx_server.webui_settings }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 8b5cb5d27..c1f86e549 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -168,7 +168,9 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para env == "LLAMA_ARG_MODEL" || env == "LLAMA_ARG_MMPROJ" || env == "LLAMA_ARG_HF_REPO" || - env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") { + env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" || + env == "LLAMA_ARG_SSL_KEY_FILE" || + env == "LLAMA_ARG_SSL_CERT_FILE") { control_args[env] = opt; } } @@ -222,6 +224,9 @@ void server_presets::render_args(server_model_meta & meta) { preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj; } } + // disable SSL for child processes (HTTPS already handled by router) + preset.options[control_args["LLAMA_ARG_SSL_KEY_FILE"]] = ""; + preset.options[control_args["LLAMA_ARG_SSL_CERT_FILE"]] = ""; meta.args = preset.to_args(); // add back the binary path at the front meta.args.insert(meta.args.begin(), get_server_exec_path().string()); @@ -813,6 +818,7 @@ void server_models_routes::init_routes() { {"params", json{}}, {"n_ctx", 0}, }}, + {"webui_settings", webui_settings}, }); return res; } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 227b15bbc..cbc4c4324 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -2,6 +2,7 @@ #include "common.h" #include "preset.h" +#include "server-common.h" #include "server-http.h" #include @@ -149,9 +150,18 @@ public: struct server_models_routes { common_params params; + json webui_settings = json::object(); server_models models; server_models_routes(const common_params & params, int argc, char ** argv, char ** envp) : params(params), models(params, argc, argv, envp) { + if (!this->params.webui_config_json.empty()) { + try { + webui_settings = json::parse(this->params.webui_config_json); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + throw; + } + } init_routes(); } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3cebe174b..b6b611b3f 100644 --- a/tools/server/server.cpp +++ 
b/tools/server/server.cpp @@ -8,6 +8,7 @@ #include "log.h" #include +#include #include #include // for std::thread::hardware_concurrency @@ -124,7 +125,12 @@ int main(int argc, char ** argv, char ** envp) { std::optional models_routes{}; if (is_router_server) { // setup server instances manager - models_routes.emplace(params, argc, argv, envp); + try { + models_routes.emplace(params, argc, argv, envp); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what()); + return 1; + } // proxy handlers // note: routes.get_health stays the same diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index 4f37b308b..0d1a03aca 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -2109,9 +2109,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.48.5", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.48.5.tgz", - "integrity": "sha512-/rnwfSWS3qwUSzvHynUTORF9xSJi7PCR9yXkxUOnRrNqyKmCmh3FPHH+E9BbgqxXfTevGXBqgnlh9kMb+9T5XA==", + "version": "2.49.2", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.49.2.tgz", + "integrity": "sha512-Vp3zX/qlwerQmHMP6x0Ry1oY7eKKRcOWGc2P59srOp4zcqyn+etJyQpELgOi4+ZSUgteX8Y387NuwruLgGXLUQ==", "dev": true, "license": "MIT", "dependencies": { @@ -5797,9 +5797,9 @@ } }, "node_modules/mdast-util-to-hast": { - "version": "13.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", - "integrity": "sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==", + "version": "13.2.1", + "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.1.tgz", + "integrity": "sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA==", "license": "MIT", "dependencies": { "@types/hast": "^3.0.0", diff --git a/tools/server/webui/src/app.d.ts b/tools/server/webui/src/app.d.ts index 71976936e..73287d91b 100644 --- a/tools/server/webui/src/app.d.ts +++ b/tools/server/webui/src/app.d.ts @@ -124,3 +124,10 @@ declare global { SettingsConfigType }; } + +declare global { + interface Window { + idxThemeStyle?: number; + idxCodeBlock?: number; + } +} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte index 2c9a012ef..8997963f1 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte @@ -244,7 +244,7 @@
{#if displayedModel()} - +
{#if isRouter} {/if} - +
{/if} {#if config().showToolCalls} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte index a453a3101..a39acb1d7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte @@ -1,20 +1,122 @@ - +
+
+ {#if hasPromptStats} + + + + + +

Reading (prompt processing)

+
+
+ {/if} + + + + + +

Generation (token output)

+
+
+
- - - +
+ {#if activeView === ChatMessageStatsView.GENERATION} + + + + {:else if hasPromptStats} + + + + {/if} +
+
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index 8eacb7744..23a9f37b7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -587,7 +587,7 @@ &::after { content: ''; - position: fixed; + position: absolute; bottom: 0; z-index: -1; left: 0; diff --git a/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte b/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte index 9e5339cab..a2b28d205 100644 --- a/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +++ b/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte @@ -1,5 +1,6 @@ - - {#snippet icon()} - - {/snippet} +{#if tooltipLabel} + + + + {#snippet icon()} + + {/snippet} - {value} - + {value} + + + +

{tooltipLabel}

+
+ +{:else} + + {#snippet icon()} + + {/snippet} + + {value} + +{/if} diff --git a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte index 2a4a39535..cb3ae17a6 100644 --- a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +++ b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte @@ -7,15 +7,19 @@ import remarkRehype from 'remark-rehype'; import rehypeKatex from 'rehype-katex'; import rehypeStringify from 'rehype-stringify'; - import { copyCodeToClipboard, preprocessLaTeX } from '$lib/utils'; - import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer'; + import type { Root as HastRoot, RootContent as HastRootContent } from 'hast'; + import type { Root as MdastRoot } from 'mdast'; import { browser } from '$app/environment'; + import { onDestroy, tick } from 'svelte'; + import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer'; + import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links'; + import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks'; + import { remarkLiteralHtml } from '$lib/markdown/literal-html'; + import { copyCodeToClipboard, preprocessLaTeX } from '$lib/utils'; import '$styles/katex-custom.scss'; - import githubDarkCss from 'highlight.js/styles/github-dark.css?inline'; import githubLightCss from 'highlight.js/styles/github.css?inline'; import { mode } from 'mode-watcher'; - import { remarkLiteralHtml } from '$lib/markdown/literal-html'; import CodePreviewDialog from './CodePreviewDialog.svelte'; interface Props { @@ -23,33 +27,24 @@ class?: string; } + interface MarkdownBlock { + id: string; + html: string; + } + let { content, class: className = '' }: Props = $props(); let containerRef = $state(); - let processedHtml = $state(''); + let renderedBlocks = $state([]); + let unstableBlockHtml = $state(''); let previewDialogOpen = $state(false); let previewCode = $state(''); let previewLanguage = $state('text'); - function loadHighlightTheme(isDark: boolean) { - if (!browser) return; + let pendingMarkdown: string | null = null; + let isProcessing = false; - const existingThemes = document.querySelectorAll('style[data-highlight-theme]'); - existingThemes.forEach((style) => style.remove()); - - const style = document.createElement('style'); - style.setAttribute('data-highlight-theme', 'true'); - style.textContent = isDark ? githubDarkCss : githubLightCss; - - document.head.appendChild(style); - } - - $effect(() => { - const currentMode = mode.current; - const isDark = currentMode === 'dark'; - - loadHighlightTheme(isDark); - }); + const themeStyleId = `highlight-theme-${(window.idxThemeStyle = (window.idxThemeStyle ?? 0) + 1)}`; let processor = $derived(() => { return remark() @@ -61,139 +56,64 @@ .use(rehypeKatex) // Render math using KaTeX .use(rehypeHighlight) // Add syntax highlighting .use(rehypeRestoreTableHtml) // Restore limited HTML (e.g.,
,
    ) inside Markdown tables - .use(rehypeStringify); // Convert to HTML string + .use(rehypeEnhanceLinks) // Add target="_blank" to links + .use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions + .use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string }); - function enhanceLinks(html: string): string { - if (!html.includes('('.copy-code-btn'); + const previewButtons = containerRef.querySelectorAll('.preview-code-btn'); + + for (const button of copyButtons) { + button.removeEventListener('click', handleCopyClick); } - const tempDiv = document.createElement('div'); - tempDiv.innerHTML = html; - - // Make all links open in new tabs - const linkElements = tempDiv.querySelectorAll('a[href]'); - let mutated = false; - - for (const link of linkElements) { - const target = link.getAttribute('target'); - const rel = link.getAttribute('rel'); - - if (target !== '_blank' || rel !== 'noopener noreferrer') { - mutated = true; - } - - link.setAttribute('target', '_blank'); - link.setAttribute('rel', 'noopener noreferrer'); - } - - return mutated ? tempDiv.innerHTML : html; - } - - function enhanceCodeBlocks(html: string): string { - if (!html.includes(' - `; - - const actions = document.createElement('div'); - actions.className = 'code-block-actions'; - - actions.appendChild(copyButton); - - if (language.toLowerCase() === 'html') { - const previewButton = document.createElement('button'); - previewButton.className = 'preview-code-btn'; - previewButton.setAttribute('data-code-id', codeId); - previewButton.setAttribute('title', 'Preview code'); - previewButton.setAttribute('type', 'button'); - - previewButton.innerHTML = ` - - `; - - actions.appendChild(previewButton); - } - - header.appendChild(languageLabel); - header.appendChild(actions); - wrapper.appendChild(header); - - const clonedPre = pre.cloneNode(true) as HTMLElement; - wrapper.appendChild(clonedPre); - - pre.parentNode?.replaceChild(wrapper, pre); - } - - return mutated ? tempDiv.innerHTML : html; - } - - async function processMarkdown(text: string): Promise { - try { - let normalized = preprocessLaTeX(text); - const result = await processor().process(normalized); - const html = String(result); - const enhancedLinks = enhanceLinks(html); - - return enhanceCodeBlocks(enhancedLinks); - } catch (error) { - console.error('Markdown processing error:', error); - - // Fallback to plain text with line breaks - return text.replace(/\n/g, '
    '); + for (const button of previewButtons) { + button.removeEventListener('click', handlePreviewClick); } } + /** + * Removes this component's highlight.js theme style from the document head. + * Called on component destroy to clean up injected styles. + */ + function cleanupHighlightTheme() { + if (!browser) return; + + const existingTheme = document.getElementById(themeStyleId); + existingTheme?.remove(); + } + + /** + * Loads the appropriate highlight.js theme based on dark/light mode. + * Injects a scoped style element into the document head. + * @param isDark - Whether to load the dark theme (true) or light theme (false) + */ + function loadHighlightTheme(isDark: boolean) { + if (!browser) return; + + const existingTheme = document.getElementById(themeStyleId); + existingTheme?.remove(); + + const style = document.createElement('style'); + style.id = themeStyleId; + style.textContent = isDark ? githubDarkCss : githubLightCss; + + document.head.appendChild(style); + } + + /** + * Extracts code information from a button click target within a code block. + * @param target - The clicked button element + * @returns Object with rawCode and language, or null if extraction fails + */ function getCodeInfoFromTarget(target: HTMLElement) { const wrapper = target.closest('.code-block-wrapper'); @@ -209,12 +129,7 @@ return null; } - const rawCode = codeElement.getAttribute('data-raw-code'); - - if (rawCode === null) { - console.error('No raw code found'); - return null; - } + const rawCode = codeElement.textContent ?? ''; const languageLabel = wrapper.querySelector('.code-language'); const language = languageLabel?.textContent?.trim() || 'text'; @@ -222,6 +137,28 @@ return { rawCode, language }; } + /** + * Generates a unique identifier for a HAST node based on its position. + * Used for stable block identification during incremental rendering. + * @param node - The HAST root content node + * @param indexFallback - Fallback index if position is unavailable + * @returns Unique string identifier for the node + */ + function getHastNodeId(node: HastRootContent, indexFallback: number): string { + const position = node.position; + + if (position?.start?.offset != null && position?.end?.offset != null) { + return `hast-${position.start.offset}-${position.end.offset}`; + } + + return `${node.type}-${indexFallback}`; + } + + /** + * Handles click events on copy buttons within code blocks. + * Copies the raw code content to the clipboard. + * @param event - The click event from the copy button + */ async function handleCopyClick(event: Event) { event.preventDefault(); event.stopPropagation(); @@ -245,6 +182,25 @@ } } + /** + * Handles preview dialog open state changes. + * Clears preview content when dialog is closed. + * @param open - Whether the dialog is being opened or closed + */ + function handlePreviewDialogOpenChange(open: boolean) { + previewDialogOpen = open; + + if (!open) { + previewCode = ''; + previewLanguage = 'text'; + } + } + + /** + * Handles click events on preview buttons within HTML code blocks. + * Opens a preview dialog with the rendered HTML content. + * @param event - The click event from the preview button + */ function handlePreviewClick(event: Event) { event.preventDefault(); event.stopPropagation(); @@ -266,6 +222,61 @@ previewDialogOpen = true; } + /** + * Processes markdown content into stable and unstable HTML blocks. + * Uses incremental rendering: stable blocks are cached, unstable block is re-rendered. 
+ * @param markdown - The raw markdown string to process + */ + async function processMarkdown(markdown: string) { + if (!markdown) { + renderedBlocks = []; + unstableBlockHtml = ''; + return; + } + + const normalized = preprocessLaTeX(markdown); + const processorInstance = processor(); + const ast = processorInstance.parse(normalized) as MdastRoot; + const processedRoot = (await processorInstance.run(ast)) as HastRoot; + const processedChildren = processedRoot.children ?? []; + const stableCount = Math.max(processedChildren.length - 1, 0); + const nextBlocks: MarkdownBlock[] = []; + + for (let index = 0; index < stableCount; index++) { + const hastChild = processedChildren[index]; + const id = getHastNodeId(hastChild, index); + const existing = renderedBlocks[index]; + + if (existing && existing.id === id) { + nextBlocks.push(existing); + continue; + } + + const html = stringifyProcessedNode( + processorInstance, + processedRoot, + processedChildren[index] + ); + + nextBlocks.push({ id, html }); + } + + let unstableHtml = ''; + + if (processedChildren.length > stableCount) { + const unstableChild = processedChildren[stableCount]; + unstableHtml = stringifyProcessedNode(processorInstance, processedRoot, unstableChild); + } + + renderedBlocks = nextBlocks; + await tick(); // Force DOM sync before updating unstable HTML block + unstableBlockHtml = unstableHtml; + } + + /** + * Attaches click event listeners to copy and preview buttons in code blocks. + * Uses data-listener-bound attribute to prevent duplicate bindings. + */ function setupCodeBlockActions() { if (!containerRef) return; @@ -287,40 +298,97 @@ } } - function handlePreviewDialogOpenChange(open: boolean) { - previewDialogOpen = open; + /** + * Converts a single HAST node to an enhanced HTML string. + * Applies link and code block enhancements to the output. + * @param processorInstance - The remark/rehype processor instance + * @param processedRoot - The full processed HAST root (for context) + * @param child - The specific HAST child node to stringify + * @returns Enhanced HTML string representation of the node + */ + function stringifyProcessedNode( + processorInstance: ReturnType, + processedRoot: HastRoot, + child: unknown + ) { + const root: HastRoot = { + ...(processedRoot as HastRoot), + children: [child as never] + }; - if (!open) { - previewCode = ''; - previewLanguage = 'text'; + return processorInstance.stringify(root); + } + + /** + * Queues markdown for processing with coalescing support. + * Only processes the latest markdown when multiple updates arrive quickly. + * @param markdown - The markdown content to render + */ + async function updateRenderedBlocks(markdown: string) { + pendingMarkdown = markdown; + + if (isProcessing) { + return; + } + + isProcessing = true; + + try { + while (pendingMarkdown !== null) { + const nextMarkdown = pendingMarkdown; + pendingMarkdown = null; + + await processMarkdown(nextMarkdown); + } + } catch (error) { + console.error('Failed to process markdown:', error); + renderedBlocks = []; + unstableBlockHtml = markdown.replace(/\n/g, '
    '); + } finally { + isProcessing = false; } } $effect(() => { - if (content) { - processMarkdown(content) - .then((result) => { - processedHtml = result; - }) - .catch((error) => { - console.error('Failed to process markdown:', error); - processedHtml = content.replace(/\n/g, '
    '); - }); - } else { - processedHtml = ''; - } + const currentMode = mode.current; + const isDark = currentMode === 'dark'; + + loadHighlightTheme(isDark); }); $effect(() => { - if (containerRef && processedHtml) { + updateRenderedBlocks(content); + }); + + $effect(() => { + const hasRenderedBlocks = renderedBlocks.length > 0; + const hasUnstableBlock = Boolean(unstableBlockHtml); + + if ((hasRenderedBlocks || hasUnstableBlock) && containerRef) { setupCodeBlockActions(); } }); + + onDestroy(() => { + cleanupEventListeners(); + cleanupHighlightTheme(); + });
    - - {@html processedHtml} + {#each renderedBlocks as block (block.id)} +
    + + {@html block.html} +
    + {/each} + + {#if unstableBlockHtml} +
    + + {@html unstableBlockHtml} +
    + {/if}
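
For reference, the MarkdownContent.svelte refactor in the last hunks renders all but the final top-level Markdown block from a cache keyed by the block's source position, and only re-renders the trailing block while tokens are still streaming in. The sketch below illustrates that caching strategy only, under simplified assumptions: `splitIntoBlocks`, `renderBlock`, and `renderIncrementally` are hypothetical stand-ins, not the component's actual remark/rehype pipeline or its HAST-position ids.

```ts
// Minimal sketch of the stable/unstable block cache used for incremental
// Markdown rendering. Block splitting and rendering are deliberately
// simplistic stand-ins for the real remark/rehype processing.

interface RenderedBlock {
    id: string;   // stable identifier derived from the block's source offsets
    html: string; // rendered HTML for this block
}

// Stand-in parser: split on blank lines and derive an id from character
// offsets, mirroring the position-based ids used by the actual component.
function splitIntoBlocks(markdown: string): { id: string; source: string }[] {
    const blocks: { id: string; source: string }[] = [];
    let offset = 0;
    for (const part of markdown.split(/\n{2,}/)) {
        const start = markdown.indexOf(part, offset);
        const end = start + part.length;
        offset = end;
        if (part.trim().length > 0) {
            blocks.push({ id: `block-${start}-${end}`, source: part });
        }
    }
    return blocks;
}

// Stand-in renderer: escape the text and wrap it in a paragraph.
function renderBlock(source: string): string {
    const escaped = source.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    return `<p>${escaped}</p>`;
}

let cachedBlocks: RenderedBlock[] = [];

// Every block except the last is treated as stable and reused when its id is
// unchanged; the trailing block is always re-rendered because streamed text
// keeps appending to it.
function renderIncrementally(markdown: string): { stable: RenderedBlock[]; unstable: string } {
    const blocks = splitIntoBlocks(markdown);
    const stableCount = Math.max(blocks.length - 1, 0);
    const stable: RenderedBlock[] = [];

    for (let i = 0; i < stableCount; i++) {
        const cached = cachedBlocks[i];
        if (cached && cached.id === blocks[i].id) {
            stable.push(cached); // cache hit: skip re-rendering this block
        } else {
            stable.push({ id: blocks[i].id, html: renderBlock(blocks[i].source) });
        }
    }

    const unstable = blocks.length > stableCount ? renderBlock(blocks[stableCount].source) : '';

    cachedBlocks = stable;
    return { stable, unstable };
}

// Example: simulate a streamed update; the first paragraph is served from the cache
// on the second call, and only the still-growing trailing block is re-rendered.
console.log(renderIncrementally('First paragraph.\n\nSecond paragraph, still strea'));
console.log(renderIncrementally('First paragraph.\n\nSecond paragraph, still streaming...'));
```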