diff --git a/common/arg.cpp b/common/arg.cpp index 4d01e7c4a..d2578ff65 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2402,7 +2402,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.fit_params = false; } else { throw std::runtime_error( - string_format("error: unkown value for --fit: '%s'\n", value.c_str())); + string_format("error: unknown value for --fit: '%s'\n", value.c_str())); } } ).set_env("LLAMA_ARG_FIT")); diff --git a/common/common.h b/common/common.h index a136d9391..74aa2c326 100644 --- a/common/common.h +++ b/common/common.h @@ -866,7 +866,7 @@ std::string common_detokenize( // Embedding utils // -// TODO: repace embd_norm with an enum +// TODO: replace embd_norm with an enum void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/common/console.cpp b/common/console.cpp index 2ea178f81..a770416ab 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -80,6 +80,8 @@ namespace console { static termios initial_state; #endif + static completion_callback completion_cb = nullptr; + // // Init and cleanup // @@ -493,7 +495,7 @@ namespace console { } static void set_line_contents(std::string new_line, std::string & line, std::vector & widths, size_t & char_pos, - size_t & byte_pos) { + size_t & byte_pos, int cursor_byte_pos = -1) { move_to_line_start(char_pos, byte_pos, widths); clear_current_line(widths); @@ -503,6 +505,7 @@ namespace console { char_pos = 0; size_t idx = 0; + int back_width = 0; while (idx < line.size()) { size_t advance = 0; char32_t cp = decode_utf8(line, idx, advance); @@ -511,8 +514,15 @@ namespace console { if (real_width < 0) real_width = 0; widths.push_back(real_width); idx += advance; - ++char_pos; - byte_pos = idx; + if (cursor_byte_pos >= 0 && static_cast(cursor_byte_pos) < idx) { + back_width += real_width; + } else { + ++char_pos; + byte_pos = idx; + } + } + if (cursor_byte_pos >= 0) { + move_cursor(-back_width); } } @@ -784,6 +794,20 @@ namespace console { break; } + if (completion_cb && input_char == '\t') { + auto candidates = completion_cb(line, byte_pos); + + if (!candidates.empty()) { + if (candidates.size() > 1 || candidates[0].first != line) { + // TODO?: Display all candidates + set_line_contents(candidates[0].first, line, widths, char_pos, byte_pos, candidates[0].second); + } else { + // TODO: Move cursor to new byte_pos + } + continue; + } + } + if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) { end_of_stream = true; break; @@ -1062,6 +1086,10 @@ namespace console { return readline_advanced(line, multiline_input); } + void set_completion_callback(completion_callback cb) { + completion_cb = cb; + } + namespace spinner { static const char LOADING_CHARS[] = {'|', '/', '-', '\\'}; static std::condition_variable cv_stop; diff --git a/common/console.h b/common/console.h index fad6d3953..72781bea6 100644 --- a/common/console.h +++ b/common/console.h @@ -4,7 +4,9 @@ #include "common.h" +#include #include +#include enum display_type { DISPLAY_TYPE_RESET = 0, @@ -21,6 +23,9 @@ namespace console { void set_display(display_type display); bool readline(std::string & line, bool multiline_input); + using completion_callback = std::function>(std::string_view, size_t)>; + void set_completion_callback(completion_callback cb); + namespace spinner { void start(); void stop(); diff --git a/common/debug.h b/common/debug.h index 0c5596325..e563b40d6 100644 --- 
a/common/debug.h +++ b/common/debug.h @@ -18,7 +18,7 @@ template void common_debug_print_tensor(uint8_t * data, ggml // prints tensors that are processed in the computation graph // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with // non-empty filter_patterns. See examples/debug.ccp for possible usage patterns -// The template parameter determins whether an error should be thrown whenever a NaN is encountered +// The template parameter determines whether an error should be thrown whenever a NaN is encountered // in a tensor (useful for stopping debug sessions on first erroneous tensor) // The callback data will be passed as the third parameter (user_data) template bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data); diff --git a/common/jinja/README.md b/common/jinja/README.md index 7059105ee..829124076 100644 --- a/common/jinja/README.md +++ b/common/jinja/README.md @@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh - **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input` - **Many-to-one** (e.g., join): same as one-to-many -For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag. +For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag. **Enabling Input Marking:** diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 095441739..083b5bca9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4031,7 +4031,7 @@ class Qwen2VLVisionModel(MmprojModel): # split Conv3D into Conv2Ds c1, c2, kt, kh, kw = data_torch.shape del c1, c2, kh, kw # unused - assert kt == 2, "Current implmentation only support temporal_patch_size of 2" + assert kt == 2, "Current implementation only support temporal_patch_size of 2" yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) else: @@ -4842,12 +4842,12 @@ class _LinearAttentionVReorderBase(Qwen3NextModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Qwen3_5ForConditionalGeneration") +@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM") class Qwen3_5TextModel(_LinearAttentionVReorderBase): model_arch = gguf.MODEL_ARCH.QWEN35 -@ModelBase.register("Qwen3_5MoeForConditionalGeneration") +@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase): model_arch = gguf.MODEL_ARCH.QWEN35MOE @@ -5404,7 +5404,7 @@ class KimiLinearModel(TextModel): # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams["linear_attn_config"] # n_head == 0 for KDA layers, n_head > 0 for MLA layers - # full_attention_layers list will be used to distingush layer type + # full_attention_layers list will be used to distinguish layer type _num_kv_heads = list() _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): @@ -6505,7 +6505,7 @@ class Gemma3VisionModel(MmprojModel): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) - # default values below are taken from HF tranformers code + # default values below are 
taken from HF transformers code self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_vision_use_gelu(True) # calculate proj_scale_factor (used by tinygemma3 test model) @@ -7097,7 +7097,7 @@ class Rwkv7Model(TextModel): if bid == 0 and "time_mix_a" in new_name: # dummy v0/v1/v2 on first layer - # easist way to make llama happy + # easiest way to make llama happy yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) yield (new_name, data_torch) @@ -9596,7 +9596,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel): # NOTE: Explicitly include hparam prefix prefix for d_model to # disambiguate with top-level head_dim # NOTE 2: If needed for future models, this can be isolated in a method - # to separate the prefix setting and teh keys used + # to separate the prefix setting and the keys used self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) self.n_group = self.find_hparam(["n_groups", "num_groups"]) self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model @@ -9743,7 +9743,7 @@ class NemotronHModel(GraniteHybridModel): self.gguf_writer.add_value_length(self.head_dim) # Set feed_forward_length - # NOTE: This will trigger an override warning. This is preferrable to + # NOTE: This will trigger an override warning. This is preferable to # duplicating all the parent logic if not self.is_moe: n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md index f71d24131..b39420021 100644 --- a/examples/diffusion/README.md +++ b/examples/diffusion/README.md @@ -43,12 +43,12 @@ Choose one of the following scheduling methods: - `-b`: Batch size ### Examples -#### Dream architechture: +#### Dream architecture: ``` llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual ``` -#### LLaDA architechture: +#### LLaDA architecture: ``` llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual ``` diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index a9d177864..9fd3f7f32 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -259,7 +259,7 @@ extern "C" { Example usage: // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned - // preferrably to run on the same backend as the buffer + // preferably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 4703a05af..1c2ed79b7 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -138,7 +138,7 @@ extern "C" { GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); - // set gradients to zero, initilize loss, and optionally reset the optimizer + // set gradients to zero, initialize loss, and optionally reset the optimizer GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically diff 
--git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0ad789ac8..e37944c4e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2593,7 +2593,7 @@ extern "C" { struct ggml_tensor * grad, struct ggml_tensor * sgd_params); // alpha, weight decay - // build forward mutiple tensors and select one of them for computing + // build forward multiple tensors and select one of them for computing // this is useful for creating graphs that have constant topology but compute different things based on the input // ref: https://github.com/ggml-org/llama.cpp/pull/18550 // diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index cc9196899..b3ab9ebb8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1462,6 +1462,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + if (sched->events[split_backend_id][sched->cur_copy] == NULL) { + ggml_backend_synchronize(split_backend); + } + // copy the input tensors to the split backend for (int input_id = 0; input_id < split->n_inputs; input_id++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]); @@ -1472,16 +1476,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); - } else { - ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); - } else { - ggml_backend_synchronize(split_backend); } // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used @@ -1585,6 +1585,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + if (sched->events[split_backend_id][sched->cur_copy] == NULL) { + ggml_backend_synchronize(split_backend); + } + if (!sched->callback_eval) { enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index b390ab61c..a707d6398 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -968,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int vector_length = ggml_cpu_get_sve_cnt()*8; - //VLA Implemenation for SVE + //VLA Implementation for SVE switch (vector_length) { case 128: { diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index c2e4623f3..fea659ee5 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -706,7 +706,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K; - // Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns + // Load the 64 quants from q8K duplicated to use vecdots with the 
interleaved columns // but still need the qs to use the low and hi bits from q4 const int8_t * q8_base = q8_ptr[b].qs + sb * 64; int8x16_t q8_qs[8]; @@ -3640,7 +3640,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, for (int b = 0; b < nb; b++) { // bsums pairs belongs to the same q8_k subblock - // 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum + // 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum const int16x8_t bsums[4]{ vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)), vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)), diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 6147ea36f..2b5a352da 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3305,7 +3305,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { #ifndef USE_FAILSAFE if (prio != GGML_SCHED_PRIO_LOW) { // Tell Windows that this thread should not be throttled (needs its own CPU core). - // Newer Windows 11 versions aggresively park (offline) CPU cores and often place + // Newer Windows 11 versions aggressively park (offline) CPU cores and often place // all our threads onto the first 4 cores which results in terrible performance with // n_threads > 4 #if _WIN32_WINNT >= 0x602 diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index da412fd00..5fd452a03 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -533,7 +533,7 @@ class tinyBLAS { if constexpr (RN > 1) { return mnpack(m, n, SIZE_N, BN); } else { - GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); + GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N); GGML_ASSERT(false); // we have miss something. } } @@ -711,7 +711,7 @@ class tinyBLAS_RVV { if constexpr (RN > 1) { return mnpack(m, n, SIZE_N, BN); } else { - GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); + GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N); GGML_ASSERT(false); // we have miss something. } } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b7a70e06f..ca1b3059b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -375,7 +375,7 @@ static void ggml_compute_forward_dup_bytes( const size_t rs = ne00 * type_size; if (nb00 == type_size) { - // src0 is contigous on first dimension, copy by rows + // src0 is contiguous on first dimension, copy by rows for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { id += rs * ir0; @@ -1795,7 +1795,7 @@ void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, dst); } break; - // TODO: templateify the implemenation and support for I64 + // TODO: templateify the implementation and support for I64 // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225 //case GGML_TYPE_I64: // { diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 303ff8e8e..600df25e4 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -2605,7 +2605,7 @@ template src[1])); - size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc. + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block. 
const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert const int64_t ne12 = op->src[1]->ne[2]; // n_tokens @@ -2870,7 +2870,7 @@ template wdata; auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); - // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t) + // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t) auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index beb7e32e4..fff70c8eb 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1215,7 +1215,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } // If attention sinks are used, potentially re-scale if KQ_max is small. - // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum + // Also add the sink as a value to KQ_rowsum, this is done after synchronization of KQ_rowsum // so it's being done unconditionally for every thread. if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) { float KQ_max_scale[cols_per_thread]; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 3f4a78cc6..7cbe32633 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() { return 128; } -// Currenlty llvm with the amdgcn target does not support unrolling loops +// Currently llvm with the amdgcn target does not support unrolling loops // that contain a break that can not be resolved at compile time. #ifdef __clang__ #pragma clang diagnostic push diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh index cd3bfd405..aaf711a61 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh @@ -18,7 +18,7 @@ #if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1 #define GGML_USE_WMMA_FATTN #elif defined(RDNA4) -#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance" +#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance" #endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1 #endif // defined(GGML_HIP_ROCWMMA_FATTN) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 89d2c35e5..9575746ba 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2815,11 +2815,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? 
dst->view_src->buffer : dst->buffer; - if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { + //enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA + bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU; + + if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) { return false; } - if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) { + if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(dst->buffer)) { return false; } @@ -2830,14 +2833,17 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; - if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { + if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) || + !copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__); #endif return false; } - if (backend_src != backend_dst) { + if (copy_from_host) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream())); + } else if (backend_src != backend_dst) { // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); @@ -3346,7 +3352,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return false; } - //rms_norm kernel assumes contigous rows + //rms_norm kernel assumes contiguous rows if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { return false; } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index a8c68e44b..4300ffc14 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -235,7 +235,7 @@ static __global__ void quantize_mmq_q8_1( q.z = roundf(xi.z*d_inv); q.w = roundf(xi.w*d_inv); - // Write back 4 int8 values as a single 32 bit value for better memroy bandwidth: + // Write back 4 int8 values as a single 32 bit value for better memory bandwidth: char4 * yqs4 = (char4 *) y[ib].qs; yqs4[iqs/4] = q; diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index dc06d0693..285c0e954 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -46,7 +46,7 @@ struct soft_max_params { }; // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled. -// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here. +// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here. 
#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpass-failed" diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index 177ffc268..07ca33f51 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -83,7 +83,7 @@ static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx, // ====================== // When ncols_template == 0 the bounds for the loops in this function are not // known and can't be unrolled. As we want to keep pragma unroll for all other -// cases we supress the clang transformation warning here. +// cases we suppress the clang transformation warning here. #ifdef __clang__ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpass-failed" diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 802d3dea0..540479500 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1287,7 +1287,7 @@ struct ggml_metal_buffer { bool use_residency_sets; // optional MTLResidencySet - // note: cannot use explicity "id" here because it is not available on certain OSes + // note: cannot use explicitly "id" here because it is not available on certain OSes id rset; // pointers to global device diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 3d5db0b79..b3390352f 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -631,7 +631,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { const bool inplace = (bool) ((const int32_t *) op->op_params)[4]; if (!inplace) { - // run a separete kernel to cpy src->dst + // run a separate kernel to cpy src->dst // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel @@ -1644,7 +1644,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { const bool inplace = (bool) ((const int32_t *) op->op_params)[4]; if (!inplace) { - // run a separete kernel to cpy src->dst + // run a separate kernel to cpy src->dst // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel @@ -2005,7 +2005,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) { const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup int16_t r1ptg = 4; // num src1 rows per threadgroup - // note: not sure how optimal are those across all different hardware. there might be someting cleverer + // note: not sure how optimal are those across all different hardware. 
there might be something cleverer switch (ne11) { case 2: r1ptg = 2; break; diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 1c705362f..9382ce53b 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -14,7 +14,7 @@ #define GGML_METAL_MAX_DEVICES 16 // number of Metal devices -// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices +// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices static int g_devices = 1; //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 6c349aa0c..a58e641ad 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4218,7 +4218,7 @@ kernel void kernel_im2col( template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; -// TODO: obolete -- remove +// TODO: obsolete -- remove //typedef void (im2col_ext_t)( // constant ggml_metal_kargs_im2col & args, // device const float * x, diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl new file mode 100644 index 000000000..a22d245d2 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl @@ -0,0 +1,75 @@ +struct Params { + ne: u32, + + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src0_0: u32, + stride_src0_1: u32, + stride_src0_2: u32, + stride_src0_3: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + ne0: u32, + ne1: u32, + ne2: u32, + ne3: u32, + + dim: u32, + src0_nedim: u32 +}; + +#ifdef TYPE_F32 +#define DataType f32 +#endif +#ifdef TYPE_I32 +#define DataType i32 +#endif + +@group(0) @binding(0) +var src0: array; + +@group(0) @binding(1) +var src1 : array; + +@group(0) @binding(2) +var dst: array; + +@group(0) @binding(3) +var params: Params; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3) { + + if (gid.x < params.ne) { + var i = gid.x; + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + var ni = array(i0, i1, i2, i3); + + if (ni[params.dim] < params.src0_nedim) { + let src_i = ni[0] * params.stride_src0_0 + + ni[1] * params.stride_src0_1 + + ni[2] * params.stride_src0_2 + + ni[3] * params.stride_src0_3; + dst[params.offset_dst + gid.x] = src0[params.offset_src0 + src_i]; + } else { + ni[params.dim] -= params.src0_nedim; + let src_i = ni[0] * params.stride_src1_0 + + ni[1] * params.stride_src1_1 + + ni[2] * params.stride_src1_2 + + ni[3] * params.stride_src1_3; + dst[params.offset_dst + gid.x] = src1[params.offset_src1 + src_i]; + } + } +} diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index e0d478ce9..e954644e2 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -186,7 +186,7 @@ class Metadata: # Quick hack to fix the Norway problem # https://hitchdev.com/strictyaml/why/implicit-typing-removed/ yaml_content = yaml_content.replace("- no\n", "- \"no\"\n") - # yaml should use 2 spaces insted of tab + # yaml should use 2 spaces instead of tab # this issue has came up with the Qwen/Qwen3-235B-A22B-Instruct-2507 model card # (I've also sent a pr tp 
fix the modelcard too) yaml_content = yaml_content.replace("\t", " ") diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py index 40d484f4e..b77c563ff 100755 --- a/gguf-py/tests/test_metadata.py +++ b/gguf-py/tests/test_metadata.py @@ -164,7 +164,7 @@ class TestMetadataMethod(unittest.TestCase): self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"), ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B')) - # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix + # Negative size --> output is a LoRA adapter --> prune "LoRA" out of the name to avoid redundancy with the suffix self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234), ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B')) diff --git a/include/llama.h b/include/llama.h index f273a3b11..90bf25395 100644 --- a/include/llama.h +++ b/include/llama.h @@ -976,7 +976,7 @@ extern "C" { // Logits for the ith token. For positive indices, Equivalent to: // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab - // Negative indicies can be used to access logits in reverse order, -1 is the last logit. + // Negative indices can be used to access logits in reverse order, -1 is the last logit. // returns NULL for invalid ids. LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); @@ -991,7 +991,7 @@ extern "C" { // Get the embeddings for the ith token. For positive indices, Equivalent to: // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd - // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. + // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding. // shape: [n_embd] (1-dimensional) // returns NULL for invalid ids. LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @@ -1011,9 +1011,9 @@ extern "C" { // Returns LLAMA_TOKEN_NULL if no token was sampled. LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i); - // Get the backend sampled probabilites for the ith token + // Get the backend sampled probabilities for the ith token // The index matches llama_get_sampled_token_ith(). - // Returns NULL if no probabilites were generated. + // Returns NULL if no probabilities were generated. LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i); LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i); @@ -1340,7 +1340,7 @@ extern "C" { float tau, float eta); - /// @details Intializes a GBNF grammar, see grammars/README.md for details. + /// @details Initializes a GBNF grammar, see grammars/README.md for details. /// @param vocab The vocabulary that this grammar will be used with. /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. /// @param grammar_root The name of the start symbol for the grammar. 
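(Editor's note, not part of the patch: the negative-index convention documented in the `llama_get_logits_ith` / `llama_get_embeddings_ith` comments fixed above is easy to misread, so here is a minimal illustrative sketch. It assumes a `llama_context` that has just decoded a batch with logits requested for its final token; the helper name is hypothetical.)

```cpp
#include "llama.h"

// Illustrative sketch only: fetch the logits of the most recently produced
// output token using the negative-index convention described in llama.h.
// Assumes `ctx` has just decoded a batch with logits enabled for its last token.
static float * last_token_logits(struct llama_context * ctx) {
    // -1 addresses the last logit row, -2 the one before it, and so on;
    // the call returns NULL if the index does not map to a valid output.
    return llama_get_logits_ith(ctx, -1);
}
```
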
diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 new file mode 100644 index 000000000..8a48d2d74 --- /dev/null +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -0,0 +1,53 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="Llama-3.2-3B-Instruct-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-completion.exe" ` + --no-mmap -m $basedir\..\..\gguf\$model ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --ctx-size 8192 --batch-size 128 -fa on ` + -ngl 99 -no-cnv --device $device $cli_opts diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7609e56cc..ff78c8cfc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -161,7 +161,7 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; - // intialized later + // initialized later cparams.pipeline_parallel = false; { @@ -1991,7 +1991,7 @@ ggml_cgraph * llama_context::graph_reserve( ggml_backend_sched_reset(sched.get()); - // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that + // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that gf_res_prev->reset(); // store the n_outputs as it is, and restore it afterwards diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 23a86ea29..b8126ce50 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1616,7 +1616,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { ggml_tensor * llm_graph_context::build_inp_out_ids() const { // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // but this would make the graph topology depend on the number of output tokens, which can interere with - // features that require constant topology such as pipline parallelism + // features that require constant topology such as pipeline parallelism // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471 //if (n_outputs < n_tokens) { // return nullptr; @@ -1779,7 +1779,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( if (v_mla) { #if 0 // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens. - // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient. + // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient. 
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens); cur = ggml_mul_mat(ctx0, v_mla, cur); #else diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 69b4cc72f..9c3084060 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector seq_srct; std::unordered_map> seq_idxs; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b78423a60..414e60b9a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -175,6 +175,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; + case LLM_TYPE_0_8B: return "0.8B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_2B: return "1.2B"; case LLM_TYPE_1_3B: return "1.3B"; @@ -246,12 +247,14 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_102B_A12B: return "102B.A12B"; case LLM_TYPE_106B_A12B: return "106B.A12B"; + case LLM_TYPE_122B_A10B: return "122B.A10B"; case LLM_TYPE_196B_A11B: return "196B.A11B"; case LLM_TYPE_230B_A10B: return "230B.A10B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_300B_A47B: return "300B.A47B"; case LLM_TYPE_310B_A15B: return "310B.A15B"; case LLM_TYPE_355B_A32B: return "355B.A32B"; + case LLM_TYPE_397B_A17B: return "397B.A17B"; case LLM_TYPE_744B_A40B: return "744B.A40B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; @@ -1638,7 +1641,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } switch (hparams.n_layer) { - // TODO: Jamba layers are a bit heterogenous, so naming this is hard. + // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. case 12: // 900M 8x???M case 32: // 51B 16x?B default: type = LLM_TYPE_UNKNOWN; @@ -2642,7 +2645,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } switch (hparams.n_layer) { - case 24: type = LLM_TYPE_2B; break; + case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; + case 32: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_9B; break; + case 64: type = LLM_TYPE_27B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -2671,8 +2676,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } switch (hparams.n_layer) { - case 28: type = LLM_TYPE_35B_A3B; break; - case 48: type = LLM_TYPE_80B_A3B; break; + case 40: type = LLM_TYPE_35B_A3B; break; + case 48: type = LLM_TYPE_122B_A10B; break; + case 60: type = LLM_TYPE_397B_A17B; break; default: type = LLM_TYPE_UNKNOWN; } } break; diff --git a/src/llama-model.h b/src/llama-model.h index d7c3e7d1c..5ecb8344a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -54,6 +54,7 @@ enum llm_type { LLM_TYPE_0_3B, LLM_TYPE_0_5B, LLM_TYPE_0_6B, + LLM_TYPE_0_8B, LLM_TYPE_1B, LLM_TYPE_1_2B, LLM_TYPE_1_3B, @@ -125,12 +126,14 @@ enum llm_type { LLM_TYPE_100B_A6B, LLM_TYPE_102B_A12B, // Solar-Open LLM_TYPE_106B_A12B, // GLM-4.5-Air + LLM_TYPE_122B_A10B, // Qwen3.5 LLM_TYPE_196B_A11B, // Step3.5-Flash LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_310B_A15B, // /MiMo-V2-Flash LLM_TYPE_355B_A32B, // GLM-4.5 + LLM_TYPE_397B_A17B, // Qwen3.5 LLM_TYPE_744B_A40B, // GLM-5 LLM_TYPE_E2B, LLM_TYPE_E4B, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9cf9a1d17..477d2d32c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2069,7 +2069,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - // correct endiannes of data in precompiled_charsmap binary blob + // correct endianness of data in precompiled_charsmap binary blob uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index b608396e5..be81709c5 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr cb(Qcur, "Qcur_attn_temp_scaled", il); } - // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) + // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group) cur = build_attn(inp_attn_k, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); diff --git a/src/models/models.h b/src/models/models.h index 0712d03d8..cf9ba04e7 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -3,7 +3,7 @@ #include "llama-model.h" #include "llama-graph.h" -// note: almost all graphs require atleast sqrtf, so include cmath globally +// note: almost all graphs require at least sqrtf, so include cmath globally #include // diff --git a/src/unicode.cpp b/src/unicode.cpp index 1475b53b6..122c8ca04 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -773,7 +773,7 @@ static std::vector unicode_regex_split_custom(const std::string & text, // tiny_aya digit grouping pattern from tokenizer.json: // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"} // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567) - // TODO: Revisit this regex, incase there are any subtle tokenization differences with the 
original regex. + // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex. bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets); } diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 21a027594..d9116e5da 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -68,7 +68,7 @@ #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backward compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_NORM_EMBD "v.norm_embd.%s" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e0eb9b32c..eeb8da58e 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -46,7 +46,7 @@ struct clip_hparams { float image_std[3]; // for models using dynamic image size, we need to have a smaller image size to warmup - // otherwise, user will get OOM everytime they load the model + // otherwise, user will get OOM every time they load the model int32_t warmup_image_size = 0; int32_t warmup_audio_size = 3000; @@ -221,7 +221,7 @@ struct clip_model { // embeddings ggml_tensor * class_embedding = nullptr; ggml_tensor * patch_embeddings_0 = nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL) ggml_tensor * patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; ggml_tensor * norm_embd_w = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d7630aaf9..2a765bf75 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2491,7 +2491,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 } } -// set of tools to manupulate images +// set of tools to manipulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv struct img_tool { enum resize_algo { diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index 944037e70..1f563fbfc 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -186,7 +186,7 @@ def trunc_normal_tf_( best when :math:`a \\leq \text{mean} \\leq b`. NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 - and the result is subsquently scaled and shifted by the mean and std args. + and the result is subsequently scaled and shifted by the mean and std args. 
Args: tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index e8eef035f..447f61aaa 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -560,7 +560,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); if ((size_t) n_len < frames_per_chunk) { - break; // last uncomplete chunk will always be a padded chunk, safe to ignore + break; // last incomplete chunk will always be a padded chunk, safe to ignore } mtmd_audio_mel out_chunk; diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 22f071028..b8c225124 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -100,7 +100,7 @@ Examples: ## Memory/Disk Requirements When running the larger models, make sure you have enough disk space to store all the intermediate files. -As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For exmaple (Llama 3.1): +As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1): | Model | Original size | Quantized size (Q4_K_M) | | ----: | ------------: | ----------------------: | diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index a5465fcd1..77362ce66 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/public_legacy/index-new.html b/tools/server/public_legacy/index-new.html index e2f39d668..2cee7f3c3 100644 --- a/tools/server/public_legacy/index-new.html +++ b/tools/server/public_legacy/index-new.html @@ -36,7 +36,7 @@ const params = signal({ n_predict: 358, // 358 is a nice number - temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower + temperature: 0.8, // adapt all following parameters to optimized min-p requirements. If for non-english, set to 0.6 or lower repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well @@ -108,7 +108,7 @@ let importedTemplates = local_storage_getDataAsObject('user_templates') if (importedTemplates) { - // saved templates were successfuly imported. + // saved templates were successfully imported. 
console.log('Processing saved templates and updating default template') params.value = { ...params.value, image_data: [] }; @@ -129,7 +129,7 @@ } function userTemplateResetToDefault() { - console.log('Reseting themplate to default') + console.log('Reseting template to default') selectedUserTemplate.value.name = 'default'; selectedUserTemplate.value.data = savedUserTemplates.value['default']; } diff --git a/tools/server/public_simplechat/datautils.mjs b/tools/server/public_simplechat/datautils.mjs index 75159d6b1..08ccc219b 100644 --- a/tools/server/public_simplechat/datautils.mjs +++ b/tools/server/public_simplechat/datautils.mjs @@ -63,7 +63,7 @@ export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold /** - * Simple minded logic to help remove repeating garbage at end of the string, till it cant. + * Simple minded logic to help remove repeating garbage at end of the string, till it can't. * If its not able to trim, then it will try to skip a char at end and then trim, a few times. * This ensures that even if there are multiple runs of garbage with different patterns, the * logic still tries to munch through them. diff --git a/tools/server/public_simplechat/simplechat.js b/tools/server/public_simplechat/simplechat.js index 2fcd24a86..c67577d5a 100644 --- a/tools/server/public_simplechat/simplechat.js +++ b/tools/server/public_simplechat/simplechat.js @@ -318,7 +318,7 @@ class SimpleChat { } /** - * Allow setting of system prompt, but only at begining. + * Allow setting of system prompt, but only at beginning. * @param {string} sysPrompt * @param {string} msgTag */ @@ -333,7 +333,7 @@ class SimpleChat { console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`); } else { if (this.xchat[0].content !== sysPrompt) { - console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`); + console.error(`ERRR:SimpleChat:SC:${msgTag}:You can't change system prompt, mid way through, ignoring...`); } } } diff --git a/tools/server/public_simplechat/ui.mjs b/tools/server/public_simplechat/ui.mjs index b2d5b9aea..afa619a06 100644 --- a/tools/server/public_simplechat/ui.mjs +++ b/tools/server/public_simplechat/ui.mjs @@ -44,7 +44,7 @@ export function el_create_button(id, callback, name=undefined, innerText=undefin } /** - * Create a para and set it up. Optionaly append it to a passed parent. + * Create a para and set it up. Optionally append it to a passed parent. * @param {string} text * @param {HTMLElement | undefined} elParent * @param {string | undefined} id @@ -111,7 +111,7 @@ export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, clas /** * Create a select ui element, with a set of options to select from. * * options: an object which contains name-value pairs - * * defaultOption: the value whose name should be choosen, by default. + * * defaultOption: the value whose name should be chosen, by default. * * cb : the call back returns the name string of the option selected. 
* * @param {string} id diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte index 4494ea880..c676e224a 100644 --- a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte +++ b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte @@ -8,6 +8,7 @@ tooltip: string; variant?: 'default' | 'destructive' | 'outline' | 'secondary' | 'ghost' | 'link'; size?: 'default' | 'sm' | 'lg' | 'icon'; + iconSize?: string; class?: string; disabled?: boolean; onclick: () => void; @@ -21,6 +22,7 @@ size = 'sm', class: className = '', disabled = false, + iconSize = 'h-3 w-3', onclick, 'aria-label': ariaLabel }: Props = $props(); @@ -38,7 +40,7 @@ > {@const IconComponent = icon} - + diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte index a0d5e863c..15936691a 100644 --- a/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte +++ b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte @@ -1,6 +1,6 @@ {#snippet badgeContent()} @@ -31,7 +33,9 @@ {/snippet} - {model} + {#if model} + + {/if} {#if showCopyIcon} @@ -39,7 +43,7 @@ {/snippet} -{#if model && isModelMode} +{#if shouldShow} {#if showTooltip} diff --git a/tools/server/webui/src/lib/components/app/models/ModelId.svelte b/tools/server/webui/src/lib/components/app/models/ModelId.svelte new file mode 100644 index 000000000..817e88286 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/models/ModelId.svelte @@ -0,0 +1,64 @@ + + +{#if resolvedShowRaw} + {modelId} +{:else} + + + {#if showOrgName}{parsed.orgName}/{/if}{parsed.modelName ?? modelId} + + + {#if parsed.params} + + {parsed.params}{parsed.activatedParams ? `-${parsed.activatedParams}` : ''} + + {/if} + + {#if parsed.quantization} + + {parsed.quantization} + + {/if} + + {#if aliases && aliases.length > 0} + {#each aliases as alias (alias)} + {alias} + {/each} + {/if} + + {#if tags && tags.length > 0} + {#each tags as tag (tag)} + {tag} + {/each} + {/if} + +{/if} diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index ebffae121..a40501e2c 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -1,6 +1,7 @@ @@ -260,10 +315,25 @@ {#if loading && options.length === 0 && isRouter}
+ Loading models…
{:else if options.length === 0 && isRouter} -

No models available.

+ {#if currentModel} + + + + + + {:else} +

No models available.

+ {/if} {:else} {@const selectedOption = getDisplayOption()} @@ -280,7 +350,7 @@ type="button" class={cn( `inline-grid cursor-pointer grid-cols-[1fr_auto_1fr] items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`, - !isCurrentModelInCache() + !isCurrentModelInCache ? 'bg-red-400/10 !text-red-400 hover:bg-red-400/20 hover:text-red-400' : forceForegroundText ? 'text-foreground' @@ -294,12 +364,21 @@ > - + {#if selectedOption} + + + + - {#if updating} + +

{selectedOption.model}

+
+
+ {:else} + Select model + {/if} + + {#if updating || isLoadingModel} {:else} @@ -316,10 +395,10 @@ placeholder="Search models..." onSearchKeyDown={handleSearchKeyDown} emptyMessage="No models found." - isEmpty={filteredOptions.length === 0 && isCurrentModelInCache()} + isEmpty={filteredOptions.length === 0 && isCurrentModelInCache} >
- {#if !isCurrentModelInCache() && currentModel} + {#if !isCurrentModelInCache && currentModel} -
{/if} + {#if filteredOptions.length === 0}

No models found.

{/if} - {#each filteredOptions as option, index (option.id)} - {@const status = getModelStatus(option.model)} - {@const isLoaded = status === ServerModelStatus.LOADED} - {@const isLoading = status === ServerModelStatus.LOADING} - {@const isSelected = currentModel === option.model || activeId === option.id} - {@const isHighlighted = index === highlightedIndex} -
handleSelect(option.id)} - onmouseenter={() => (highlightedIndex = index)} - onkeydown={(e) => { - if (e.key === 'Enter' || e.key === ' ') { - e.preventDefault(); - handleSelect(option.id); - } - }} - > - + Loaded models +

+ {:else if group.isFavouritesGroup} +

+ Favourite models +

+ {:else if group.orgName} +

- {option.model} - + {group.orgName} +

+ {/if} -
- {#if isLoading} - - - - - -

Loading model...

-
-
- {:else if isLoaded} - - - - - -

Unload model

-
-
- {:else} - - {/if} -
-
+ {#each group.items as { option, flatIndex } (group.isLoadedGroup ? `loaded-${option.id}` : group.isFavouritesGroup ? `fav-${option.id}` : option.id)} + {@const isSelected = currentModel === option.model || activeId === option.id} + {@const isHighlighted = flatIndex === highlightedIndex} + {@const isFav = modelsStore.favouriteModelIds.has(option.model)} + + (highlightedIndex = flatIndex)} + onKeyDown={(e) => { + if (e.key === KeyboardKey.ENTER || e.key === KeyboardKey.SPACE) { + e.preventDefault(); + handleSelect(option.id); + } + }} + /> + {/each} {/each}
@@ -422,7 +465,7 @@