mirror of https://github.com/LostRuins/koboldcpp.git
synced 2026-05-10 04:00:53 +00:00
Merge commit '2cd20b72ed' into concedo_experimental

# Conflicts:
#	CONTRIBUTING.md
#	docs/backend/CANN.md
#	docs/backend/SYCL.md
#	docs/backend/snapdragon/README.md
#	docs/backend/snapdragon/windows.md
#	docs/build.md
#	docs/multimodal/MobileVLM.md
#	docs/ops.md
#	docs/ops/WebGPU.csv
#	examples/debug/README.md
#	examples/llama.vim
#	examples/model-conversion/README.md
#	examples/sycl/README.md
#	ggml/src/ggml-cpu/amx/mmq.cpp
#	ggml/src/ggml-cpu/arch/x86/repack.cpp
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp-drv.cpp
#	ggml/src/ggml-hexagon/htp/flash-attn-ops.c
#	ggml/src/ggml-hexagon/htp/hvx-base.h
#	ggml/src/ggml-hexagon/htp/hvx-copy.h
#	ggml/src/ggml-hexagon/htp/hvx-inverse.h
#	ggml/src/ggml-hexagon/htp/hvx-reduce.h
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	ggml/src/ggml-hexagon/htp/rope-ops.c
#	ggml/src/ggml-hexagon/htp/worker-pool.c
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cpy.cl
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/quants.hpp
#	ggml/src/ggml-sycl/softmax.cpp
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/pr2wt.sh
#	scripts/server-bench.py
#	scripts/snapdragon/windows/run-cli.ps1
#	tests/test-alloc.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
#	tools/cli/cli.cpp
#	tools/completion/README.md
#	tools/cvector-generator/cvector-generator.cpp
#	tools/imatrix/README.md
#	tools/perplexity/README.md
#	tools/server/public_simplechat/readme.md
#	tools/server/tests/README.md
commit 746664fde6
107 changed files with 1037 additions and 314 deletions
@@ -161,7 +161,7 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
-    // intialized later
+    // initialized later
     cparams.pipeline_parallel = false;
 
     {
@@ -1991,7 +1991,7 @@ ggml_cgraph * llama_context::graph_reserve(
 
     ggml_backend_sched_reset(sched.get());
 
-    // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
+    // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
     gf_res_prev->reset();
 
     // store the n_outputs as it is, and restore it afterwards
@@ -1616,7 +1616,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
     // but this would make the graph topology depend on the number of output tokens, which can interfere with
-    // features that require constant topology such as pipline parallelism
+    // features that require constant topology such as pipeline parallelism
     // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
     //if (n_outputs < n_tokens) {
     //    return nullptr;
@@ -1779,7 +1779,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         if (v_mla) {
 #if 0
             // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
-            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
             cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
 #else
@@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
             break;
         }
 
-        // remeber the position that we found
+        // remember the position that we found
         res.push_back(sinfo_new);
 
         // store the old state of the cells in the recovery stack
@@ -1293,7 +1293,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
     }
 
     for (uint32_t s = 0; s < n_stream; ++s) {
-        // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
+        // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
         std::unordered_map<llama_seq_id, uint32_t> seq_srct;
         std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
 
@@ -175,6 +175,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
+        case LLM_TYPE_0_8B: return "0.8B";
         case LLM_TYPE_1B: return "1B";
         case LLM_TYPE_1_2B: return "1.2B";
         case LLM_TYPE_1_3B: return "1.3B";
@@ -246,12 +247,14 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
+        case LLM_TYPE_122B_A10B: return "122B.A10B";
         case LLM_TYPE_196B_A11B: return "196B.A11B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
         case LLM_TYPE_310B_A15B: return "310B.A15B";
         case LLM_TYPE_355B_A32B: return "355B.A32B";
+        case LLM_TYPE_397B_A17B: return "397B.A17B";
         case LLM_TYPE_744B_A40B: return "744B.A40B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
@@ -1638,7 +1641,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
+                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
                     case 12: // 900M 8x???M
                     case 32: // 51B 16x?B
                     default: type = LLM_TYPE_UNKNOWN;
@@ -2642,7 +2645,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_2B; break;
+                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+                    case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
                     case 64: type = LLM_TYPE_27B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2671,8 +2676,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_35B_A3B; break;
-                    case 48: type = LLM_TYPE_80B_A3B; break;
+                    case 40: type = LLM_TYPE_35B_A3B; break;
+                    case 48: type = LLM_TYPE_122B_A10B; break;
+                    case 60: type = LLM_TYPE_397B_A17B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -54,6 +54,7 @@ enum llm_type {
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
+    LLM_TYPE_0_8B,
     LLM_TYPE_1B,
     LLM_TYPE_1_2B,
     LLM_TYPE_1_3B,
@@ -125,12 +126,14 @@ enum llm_type {
     LLM_TYPE_100B_A6B,
     LLM_TYPE_102B_A12B, // Solar-Open
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
+    LLM_TYPE_122B_A10B, // Qwen3.5
     LLM_TYPE_196B_A11B, // Step3.5-Flash
     LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_310B_A15B, // MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
+    LLM_TYPE_397B_A17B, // Qwen3.5
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
@@ -2069,7 +2069,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            // correct endiannes of data in precompiled_charsmap binary blob
+            // correct endianness of data in precompiled_charsmap binary blob
             uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
            *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
             assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
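The hunk above only fixes a comment, but the pattern it documents is worth spelling out: the blob stores a little-endian uint32_t size prefix, so a big-endian host must byte-swap it before use. A minimal standalone sketch of that idea follows; the read_le_u32 helper is hypothetical, not llama.cpp's API.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical helper: read a little-endian uint32_t size prefix from a
// binary blob, byte-swapping only when the host is big-endian.
static uint32_t read_le_u32(const std::vector<char> & blob) {
    assert(blob.size() >= sizeof(uint32_t));
    uint32_t v;
    std::memcpy(&v, blob.data(), sizeof(v)); // memcpy avoids alignment/aliasing issues
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    v = __builtin_bswap32(v); // the blob is little-endian on disk
#endif
    return v;
}

int main() {
    const std::vector<char> blob = { 0x10, 0x00, 0x00, 0x00 }; // little-endian 16
    std::printf("blob size prefix: %u\n", read_le_u32(blob));  // 16 on either host
    return 0;
}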
@@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Qcur, "Qcur_attn_temp_scaled", il);
             }
 
-            // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+            // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn_k,
                     model.layers[il].wo, NULL,
                     Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
@@ -3,7 +3,7 @@
 #include "llama-model.h"
 #include "llama-graph.h"
 
-// note: almost all graphs require atleast sqrtf, so include cmath globally
+// note: almost all graphs require at least sqrtf, so include cmath globally
 #include <cmath>
 
 //
@@ -773,7 +773,7 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
         // tiny_aya digit grouping pattern from tokenizer.json:
         // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
         // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
-        // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
+        // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex.
         bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
     }
 
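Since the right-to-left grouping behavior of that regex is easy to misread, here is a standalone illustration using std::regex (which supports the ECMAScript lookahead the pattern relies on). This is only a sketch of what the pattern matches, not the tokenizer's actual code path, which goes through unicode_regex_split_custom_afmoe.

#include <iostream>
#include <regex>
#include <string>

int main() {
    // \d{1,3} digits, with a lookahead requiring that what follows is zero
    // or more complete groups of 3 digits ending at a word boundary. This
    // forces the splits to align from the right.
    const std::regex grouping(R"(\d{1,3}(?=(?:\d{3})*\b))");
    const std::string text = "1234567";

    for (std::sregex_iterator it(text.begin(), text.end(), grouping), end;
         it != end; ++it) {
        std::cout << it->str() << '\n'; // prints: 1, 234, 567
    }
    return 0;
}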