koboldcpp

mirror of https://github.com/LostRuins/koboldcpp.git synced 2026-05-19 16:31:59 +00:00

Author	SHA1	Message	Date
Concedo	9a38091207	support q5_1 kv	2026-04-17 17:06:15 +08:00
Concedo	ae292c496e	handle SWA conflicting with rewind, increased default SWA padding.	2026-04-16 17:00:26 +08:00
Concedo	0251c6dbde	added swa padding controls	2026-04-16 16:21:48 +08:00
Concedo	ac29e6f0c0	Merge branch 'upstream' into concedo_experimental # Conflicts: # .devops/vulkan.Dockerfile # .github/workflows/build-self-hosted.yml # .github/workflows/build.yml # .github/workflows/release.yml # .github/workflows/server-self-hosted.yml # docs/build.md # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/hex-utils.h # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/hmx-utils.h # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/hvx-base.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-webgpu/ggml-webgpu.cpp # tests/test-backend-ops.cpp # tests/test-mtmd-c-api.c	2026-04-15 15:15:19 +08:00
Xuan-Son Nguyen	fae3a28070	ggml : remove ggml-ext.h (#21869 ) * ggml: correct placement of ggml-ext.h * ggml : remove ggml-ext.h --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-04-14 17:32:58 +03:00
Concedo	5361b45fba	Merge branch 'upstream' into concedo_experimental # Conflicts: # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # requirements/requirements-tool_bench.txt	2026-04-12 16:22:26 +08:00
Johannes Gäßler	865ff06b2f	TP: fix Qwen 3 Next data split (#21732 )	2026-04-11 09:23:42 +02:00
Concedo	4c860ae4ae	Merge branch 'upstream' into concedo_experimental # Conflicts: # common/download.cpp # docs/backend/OPENVINO.md # docs/backend/snapdragon/CMakeUserPresets.json # docs/backend/snapdragon/README.md # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/act-ops.c # ggml/src/ggml-hexagon/htp/argsort-ops.c # ggml/src/ggml-hexagon/htp/binary-ops.c # ggml/src/ggml-hexagon/htp/cpy-ops.c # ggml/src/ggml-hexagon/htp/cumsum-ops.c # ggml/src/ggml-hexagon/htp/flash-attn-ops.c # ggml/src/ggml-hexagon/htp/get-rows-ops.c # ggml/src/ggml-hexagon/htp/hex-utils.h # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/hmx-ops.h # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/htp_iface.idl # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-hexagon/htp/repeat-ops.c # ggml/src/ggml-hexagon/htp/rope-ops.c # ggml/src/ggml-hexagon/htp/set-rows-ops.c # ggml/src/ggml-hexagon/htp/softmax-ops.c # ggml/src/ggml-hexagon/htp/ssm-conv.c # ggml/src/ggml-hexagon/htp/sum-rows-ops.c # ggml/src/ggml-hexagon/htp/unary-ops.c # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl # models/templates/google-gemma-4-31B-it-interleaved.jinja # models/templates/google-gemma-4-31B-it.jinja # scripts/snapdragon/adb/run-bench.sh # scripts/snapdragon/adb/run-cli.sh # scripts/snapdragon/adb/run-completion.sh # scripts/snapdragon/adb/run-tool.sh # scripts/snapdragon/windows/run-bench.ps1 # scripts/snapdragon/windows/run-cli.ps1 # scripts/snapdragon/windows/run-mtmd.ps1 # 
scripts/snapdragon/windows/run-tool.ps1 # tests/test-backend-ops.cpp # tests/test-chat.cpp # tools/llama-bench/llama-bench.cpp	2026-04-11 11:19:32 +08:00
Concedo	a165a73120	Merge commit '`d6f3030047`' into concedo_experimental # Conflicts: # examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py # examples/model-conversion/scripts/utils/semantic_check.py # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cpu/amx/amx.cpp # ggml/src/ggml-cuda/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hip/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-openvino/ggml-openvino.cpp # ggml/src/ggml-rpc/ggml-rpc.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp # ggml/src/ggml-virtgpu/ggml-backend.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-zdnn/ggml-zdnn.cpp # ggml/src/ggml-zendnn/ggml-zendnn.cpp # pyproject.toml # requirements/requirements-convert_legacy_llama.txt # requirements/requirements-tool_bench.txt # src/llama-model.cpp # src/llama.cpp # tests/test-llama-archs.cpp # tests/test-tokenizer-0.py # tests/test-tokenizer-random.py # tools/llama-bench/llama-bench.cpp # tools/perplexity/perplexity.cpp	2026-04-11 11:10:55 +08:00
Concedo	8b90bfe094	Merge commit '`4ef9301e4d`' into concedo_experimental # Conflicts: # .github/labeler.yml # docs/multimodal.md # embd_res/ggml-vocab-gemma-4.gguf # embd_res/ggml-vocab-gemma-4.gguf.inp # embd_res/ggml-vocab-gemma-4.gguf.out # ggml/src/ggml-sycl/fattn-tile.cpp # ggml/src/ggml-sycl/fattn-tile.hpp # ggml/src/ggml-sycl/fattn-vec.hpp # ggml/src/ggml-sycl/fattn.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp # 
ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp # ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp # tests/CMakeLists.txt # tests/test-jinja.cpp # tools/mtmd/CMakeLists.txt	2026-04-11 09:38:50 +08:00
MoonRide303	e62fa13c24	model : make Gemma 4 shared-KV tail attn_k tensors optional on load (#21739 )	2026-04-10 21:45:50 +02:00
Johannes Gäßler	d6f3030047	ggml: backend-agnostic tensor parallelism (experimental) (#19378 ) * ggml: backend-agnostic tensor parallelism * support for GPT-OSS, Qwen 3 MoE * partial Vulkan fix * add support for 4/8 GPUs * unconditional peer access * re-use buffers + ggml contexts * fix output pattern * NCCL support * GGML: HIP: add RCCL support * Remove shfl and AllReduce from backend interface * move allocation workaround out of ggml-alloc.c * 2d tensor set/get support * Fix the seg fault without NCCL * Apply suggestion from JohannesGaessler * support for tensor dims % n_devs != 0 * fix view_offs scaling * arbitrary num. of GPUs/tensor split * fix compilation * better granularity estimate * Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA. Fix compilation errors. * partial Qwen 3 Next support * Fix qwen3 30b (#8) * Fix crash with Qwen-30B-A3B Q4_0 Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which is not supported by the current implementation. * Decide block size based on tensor quantization type * Fix crashes due to KV cache serialization (#9) KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset. * metal : fix build (#7) * static memory allocations, fix usage count * fix tensor granularity * more even memory distribution * use BF16 for allreduce * rebase fixup * better error message for unsupported architectures * Fix device mismatch during scatter of allReduce. (#11) There is a mismatch between the dst buffer device and the backend device, causing the use of sync copies * Enable the previous allreduce implementation. 
It is better in both perf and stability (#12) * delay AllReduce for Moe for less I/O * build : clean-up compile warnings * backend : move most of the meta backend API to ggml-backend-impl.h * cont : hide unused public API in the implementation * llama : use llama_device + remove ggml_backend_dev_is_meta() * ggml-backend : remove unused alloc include * minor : remove regex include * ggml : introduce ggml-ext.h for staging new APIs * rebase fixup * fix tests * llama : more robust logic for determining Meta devices (#16) * llama : more robust logic for determining Meta devices * cont : fix devs size check Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * cont : fix log type Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * disable roundtrip for meta backend * fix arch selection * Qwen 3.5 support * fix Gemma 4 MoE * fix OpenVino, SYCL * fix test-llama-archs for CPU-only builds * Fix Qwen 3.5 MoE * disable meta backend tests for WebGPU * tests : filter CPU-based devices from the Meta backend tests (#17) * meta : formatting, naming, indentation (#18) * formatting : llama-model.cpp * formatting : ggml-ext.h * formatting : ggml-backend-meta.cpp * meta : add TODO * add documentation * better error messages * fix GPT-OSS --------- Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz> Co-authored-by: Gaurav Garg <gaugarg@nvidia.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-04-09 16:42:19 +02:00
Xuan-Son Nguyen	057dba336e	model: fix multimodal padding token for gemma3n/gemma4 (#21625 ) * model: fix multimodal padding token for gemma3n/gemma4 * nits	2026-04-09 12:18:23 +02:00
Concedo	c82c0b463a	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/labeler.yml # .github/workflows/release.yml # examples/debug/debug.cpp # ggml/src/ggml-cuda/common.cuh # ggml/src/ggml-cuda/mmq.cuh # ggml/src/ggml-webgpu/ggml-webgpu.cpp # src/llama-vocab.cpp # tests/test-backend-ops.cpp # tests/test-chat.cpp # tests/test-json-schema-to-grammar.cpp # tools/mtmd/CMakeLists.txt	2026-04-09 17:45:04 +08:00
Piotr Wilkin (ilintar)	0ec191e1d7	vocab: add gemma4 tokenizer tests, fix edge case (#21534 ) * YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests! * Remove unnecessary hash from update script. * minor: move constant	2026-04-09 11:41:14 +02:00
Concedo	5529748a01	Merge commit '`de1aa6fa73`' into concedo_experimental # Conflicts: # docs/build.md # docs/ops.md # docs/ops/WebGPU.csv # ggml/src/ggml-sycl/dequantize.hpp # ggml/src/ggml-sycl/dmmv.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-sycl/quants.hpp # ggml/src/ggml-sycl/vecdotq.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # tests/test-backend-ops.cpp # tests/test-quantize-fns.cpp	2026-04-09 17:16:33 +08:00
Aldehir Rojas	d9a12c82f0	vocab : remove </s> eog token if gemma4 (#21492 )	2026-04-08 09:53:06 -05:00
Erik Scholz	3ba12fed0a	kv-cache : extend cache quantization checks (#21586 ) to also check for enabled flash attention, instead of just auto.	2026-04-08 16:08:57 +03:00
Georgi Gerganov	5764d7c6a6	gemma : perform per-layer projections in the first layer (#21612 ) * gemma : reduce graph splits by keeping per-layer ops in the input layer * gemma : put the per-layer proj in the first layer * cont : move the projection before the layer loop	2026-04-08 16:06:30 +03:00
Georgi Gerganov	4eb19514dd	kv-cache : support attention rotation for heterogeneous iSWA (#21513 ) * kv-cache : support attention rotation for heterogeneous iSWA * cont : remove assert	2026-04-07 20:31:28 +03:00
Son H. Nguyen	0d049d6a92	unicode : add custom Qwen2 regex handler to fix segfault on long input (#21257 ) * unicode : add custom Qwen2 regex handler to fix segfault on long input std::regex uses recursive backtracking internally, which causes a stack overflow (segfault) when tokenizing long sequences of repeated characters (e.g. 43K 'A's). The Qwen2 tokenizer regex differs from Llama3 only in the digit pattern (\p{N} vs \p{N}{1,3}), so it was falling through to the std::regex fallback path instead of using a custom handler. Add unicode_regex_split_custom_qwen2() following the established pattern used by gpt2, llama3, kimi_k2, and afmoe custom handlers. Closes: https://github.com/ggml-org/llama.cpp/issues/21113 * cont : remove TODO comment * cont : update comment to reflect original regex * use the correct regex in the comment this time... [no ci] --------- Co-authored-by: Aldehir Rojas <hello@alde.dev>	2026-04-07 16:13:38 +03:00
Johannes Gäßler	a8ec0df461	llama: remove per-arch tensor name lists (#21531 )	2026-04-07 15:02:03 +02:00
Concedo	15d269197e	Merge commit '`506200cf8b`' into concedo_experimental # Conflicts: # docs/multimodal.md # scripts/compare-llama-bench.py # src/llama-vocab.cpp # tools/llama-bench/README.md # tools/llama-bench/llama-bench.cpp	2026-04-07 14:58:36 +08:00
Pasha Khosravi	2e1f0a889e	ggml: add Q1_0 1-bit quantization support (CPU) (#21273 ) * ggml: add Q1_0 and Q1_0_g128 1-bit quantization support (CPU) * add generic fallback for x86 * remove Q1_0 (group size 32) * rename Q1_0_g128 => Q1_0 * fix Q1_0 LlamaFileType Enum * Fix trailing spaces; add generic fallback for other backends * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * fix /r/n spacing + arch-fallback --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-04-06 20:55:21 +02:00
Aldehir Rojas	4aa962e2b0	vocab : add byte token handling to BPE detokenizer for Gemma4 (#21488 )	2026-04-06 09:08:37 -05:00
Concedo	a395af65db	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build-riscv.yml # .github/workflows/build.yml # ggml/src/ggml-hexagon/htp/argsort-ops.c # ggml/src/ggml-sycl/fattn-tile.hpp # tools/mtmd/CMakeLists.txt	2026-04-06 20:56:02 +08:00
Georgi Gerganov	400ac8e194	convert : set "add bos" == True for Gemma 4 (#21500 ) * convert : set "add bos" == True for Gemma 4 * cont : handle old GGUFs	2026-04-06 13:52:07 +03:00
anchortense	58190cc84d	llama : correct platform-independent loading of BOOL metadata (#21428 ) * model-loader : fix GGUF bool array conversion * model-loader : fix remaining GGUF bool pointer uses	2026-04-06 01:40:38 +02:00
Richard Davison	af76639f72	model : add HunyuanOCR support (#21395 ) * HunyuanOCR: add support for text and vision models - Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge - Add separate HUNYUAN_OCR chat template (content-before-role format) - Handle HunyuanOCR's invalid pad_token_id=-1 in converter - Fix EOS/EOT token IDs from generation_config.json - Support xdrope RoPE scaling type - Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.) - Register HunYuanVLForConditionalGeneration for both text and mmproj conversion * fix proper mapping * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * address comments * update * Fix typecheck * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-04-05 23:32:14 +02:00
Concedo	9b1f1bbf35	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build-vulkan.yml # .github/workflows/docker.yml # embd_res/templates/google-gemma-4-31B-it-interleaved.jinja # embd_res/templates/google-gemma-4-31B-it.jinja # tests/test-chat.cpp	2026-04-05 18:46:23 +08:00
Aldehir Rojas	b8635075ff	common : add gemma 4 specialized parser (#21418 ) * common : add gemma4 dedicated parser * cont : add '<\|tool_response>' as eog * cont : emit JSON from Gemma4 tool call AST * cont : more fixes * cont : refactor convert function * cont : refine rules and mapping * cont : add more tests * cont : clean up * cont : remove autoparser gemma4 implementation * cont : more cleanup * cont : rename gemma4.jinja to match the others * cont : add custom template to support interleaved thinking * cont : preserve reasoning in model turns * cont : fix initializer error * cont : fix unused vars * cont : fix accidental static * cont : fix specialized_template signature * fix extra semicolon * remove debug line and extra space [no ci]	2026-04-04 20:39:00 +02:00
Concedo	376aaf258c	Merge branch 'upstream' into concedo_experimental	2026-04-04 23:56:02 +08:00
SamareshSingh	650bf14eb9	llama-model: read final_logit_softcapping for Gemma 4 (#21390 )	2026-04-04 13:05:10 +02:00
Aman Gupta	b7ad48ebda	llama: add custom newline split for Gemma 4 (#21406 )	2026-04-04 15:06:34 +08:00
Piotr Wilkin (ilintar)	d3416a4aa9	fix: remove stale assert (#21369 )	2026-04-03 13:40:41 +02:00
Concedo	784e193fbb	Merge branch 'upstream' into concedo_experimental # Conflicts: # .devops/nix/package.nix # .github/workflows/build.yml # .github/workflows/hip-quality-check.yml # docs/backend/ZenDNN.md # docs/ops.md # docs/ops/ZenDNN.csv # ggml/src/ggml-zendnn/CMakeLists.txt # ggml/src/ggml-zendnn/ggml-zendnn.cpp	2026-04-03 19:04:57 +08:00
Concedo	f39bfc39e9	gemma4 jinja tool calls fixed, added gemma4 <eos> token handling	2026-04-03 17:39:21 +08:00
Concedo	8fa87621d1	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/labeler.yml # common/chat.cpp # ggml/src/ggml-rpc/ggml-rpc.cpp	2026-04-03 16:36:41 +08:00
Piotr Wilkin (ilintar)	b069b10ab4	vocab: fix Gemma4 tokenizer (#21343 ) * seems to work * fix case with new line Co-authored-by: sayap <sokann@gmail.com> * gemma 4: fix pre tok regex --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: sayap <sokann@gmail.com>	2026-04-03 10:33:03 +02:00
Georgi Gerganov	39b27f0da0	(revert) kv-cache : do not quantize SWA KV cache (#21332 ) This reverts commit `17193cce34`.	2026-04-03 09:07:01 +03:00
Concedo	34ad53e950	merged support for gemma4. the e2b, e4b and 26b work, the 31b does not	2026-04-03 11:07:46 +08:00
Bartowski	7992aa7c8e	tests : add unit test coverage for llama_tensor_get_type (#20112 ) * Add unit test coverage for llama_tensor_get_type * Fix merge conflicts, add more schemas * clang formatter changes * Trailing whitespace * Update name * Start rebase * Updating files with upstream changes prior to rebase * Changes needed from rebase * Update attn_qkv schema, change throw behaviour * Fix merge conflicts * White space * Update with latest changes to state counters * Revert accidental personal CLAUDE.md changes * Change quotation mark * Reuse metadata.name since we have it * Move test-only stuff out of llama-quant.cpp * Hide the regex functionality back in llama-quant.cpp, use a unique pointer to a new struct 'compiled_tensor_type_patterns' which contains the patterns * cont : initial deslop guidelines * Cleanup based on review comments * Continue cleanup * Small cleanup * Manually set proper ordering of tensors, mostly applies to gemma * Formatting * Update tests/test-quant-type-selection.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Fix merge conflicts --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-04-02 22:53:58 +02:00
Concedo	5dee1a1cbb	Merge commit '`fbd441c379`' into concedo_experimental # Conflicts: # .github/workflows/build.yml # AGENTS.md # ci/run.sh # docs/build.md # embd_res/templates/LFM2.5-Instruct.jinja # ggml/CMakeLists.txt # ggml/src/ggml-cuda/fattn.cu # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/htp-msg.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/hvx-div.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/unary-ops.c # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-sycl/common.hpp # ggml/src/ggml-sycl/convert.cpp # ggml/src/ggml-sycl/dequantize.hpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-sycl/vecdotq.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl # scripts/hip/gcn-cdna-vgpr-check.py # scripts/sync-ggml.last # tests/test-chat.cpp	2026-04-03 01:06:02 +08:00
Xuan-Son Nguyen	63f8fe0ef4	model, mtmd: fix gguf conversion for audio/vision mmproj (#21309 ) * fix gguf conversion for audio/vision mmproj * fix test	2026-04-02 17:10:32 +02:00
Jesus Talavera	6137c325a1	chat : add Granite 4.0 chat template with correct tool_call role mapping (#20804 ) * chat : add Granite 4.0 chat template with correct tool_call role mapping Introduce `LLM_CHAT_TEMPLATE_GRANITE_4_0` alongside the existing Granite 3.x template (renamed `LLM_CHAT_TEMPLATE_GRANITE_3_X`). The Granite 4.0 Jinja template uses `<tool_call>` XML tags and maps the `assistant_tool_call` role to `<\|start_of_role\|>assistant<\|end_of_role\|><\|tool_call\|>`. Without a matching C++ handler, the fallback path emits the literal role `assistant_tool_call` which the model does not recognize, breaking tool calling when `--jinja` is not used. Changes: - Rename `LLM_CHAT_TEMPLATE_GRANITE` to `LLM_CHAT_TEMPLATE_GRANITE_3_X` (preserves existing 3.x behavior unchanged) - Add `LLM_CHAT_TEMPLATE_GRANITE_4_0` enum, map entry, and handler - Detection: `<\|start_of_role\|>` + (`<tool_call>` or `<tools>`) → 4.0, otherwise → 3.x - Add production Granite 4.0 Jinja template - Add tests for both 3.x and 4.0 template paths (C++ and Jinja) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Code review: follow standard format and use common logic in test-chat-template.cpp * Rename custom_conversation variable for extra_conversation to give it a more meaningful name --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-04-02 11:28:56 +02:00
Georgi Gerganov	17193cce34	kv-cache : do not quantize SWA KV cache (#21277 )	2026-04-02 11:54:05 +03:00
Georgi Gerganov	744c0c7310	llama : rotate activations for better quantization (#21038 ) * llama : rotate activations for better quantization * cont : rotate V more + refactor * cont : rotate caches separately + support non-power-of-2 head sizes * cont : simplify * cont : add reference for V rotation * cont : refactor * cont : support context shift * cont : consolidate * cont : dedup + allow different types for the rotation matrix * cont : add env variable to disable rotation * cont : simplify attn rot kv cache logic + rename env * cont : pre-compute the Hadamard matrices	2026-04-01 16:58:01 +03:00
Ettore Di Giacinto	e1cb817483	memory: respect unified KV cache in hybrid memory for eval tasks (#21224 ) The hybrid memory paths (`llama-memory-hybrid.cpp` and `llama-memory-hybrid-iswa.cpp`) always used sequential equal split, ignoring the unified KV cache flag. This caused hellaswag, winogrande, and multiple-choice evaluations to fail on hybrid models (models with both attention and recurrent/SSM layers, such as Qwen3.5-35B-A3B) with: split_equal: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag) PR #19954 fixed this for `llama-kv-cache-iswa.cpp` by automatically enabling unified KV mode and setting n_parallel >= 4 for multi-choice eval tasks. However, the hybrid memory paths were not updated. This commit mirrors the iswa fix: use non-sequential split when KV cache is unified (n_stream == 1), which is automatically set by llama-perplexity for hellaswag/winogrande/multiple-choice since #19954. Tested on Qwen3.5-35B-A3B (hybrid attention+SSM MoE model): - HellaSwag: 83.0% (400 tasks) - Winogrande: 74.5% (400 tasks) - MMLU: 41.2% - ARC-Challenge: 56.2% - TruthfulQA: 37.7% All previously failed with llama_decode() error.	2026-04-01 12:50:17 +03:00
Ed Addario	4951250235	llama : refactor llama_model_quantize_params to expose a pure C interface (#20346 ) * Refactor llama_model_quantize_params to expose a pure C interface * Restore comment and cleanup struct def * Code review refactoring Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Code review refactoring --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-04-01 08:43:00 +03:00
Concedo	31aa072da1	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build.yml # .github/workflows/release.yml # .gitignore # examples/batched/batched.cpp # examples/debug/debug.cpp # examples/eval-callback/eval-callback.cpp # examples/idle/idle.cpp # examples/lookahead/lookahead.cpp # examples/lookup/lookup-create.cpp # examples/lookup/lookup-stats.cpp # examples/lookup/lookup.cpp # examples/parallel/parallel.cpp # examples/passkey/passkey.cpp # examples/retrieval/retrieval.cpp # examples/save-load-state/save-load-state.cpp # examples/speculative-simple/speculative-simple.cpp # examples/speculative/speculative.cpp # examples/training/finetune.cpp # ggml/CMakeLists.txt # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/common.h # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-sycl/fattn-tile.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py # ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl # scripts/sync-ggml.last # tests/export-graph-ops.cpp # tests/test-chat.cpp # tests/test-state-restore-fragmented.cpp # tests/test-thread-safety.cpp # tools/batched-bench/batched-bench.cpp # tools/cli/cli.cpp # tools/cvector-generator/cvector-generator.cpp # tools/export-lora/export-lora.cpp # tools/imatrix/imatrix.cpp # tools/perplexity/perplexity.cpp # tools/results/results.cpp # tools/server/CMakeLists.txt	2026-04-01 10:54:13 +08:00

1 2 3 4 5 ...

1314 commits