koboldcpp

mirror of https://github.com/LostRuins/koboldcpp.git synced 2026-04-28 03:30:20 +00:00

Author	SHA1	Message	Date
Concedo	31aa072da1	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build.yml # .github/workflows/release.yml # .gitignore # examples/batched/batched.cpp # examples/debug/debug.cpp # examples/eval-callback/eval-callback.cpp # examples/idle/idle.cpp # examples/lookahead/lookahead.cpp # examples/lookup/lookup-create.cpp # examples/lookup/lookup-stats.cpp # examples/lookup/lookup.cpp # examples/parallel/parallel.cpp # examples/passkey/passkey.cpp # examples/retrieval/retrieval.cpp # examples/save-load-state/save-load-state.cpp # examples/speculative-simple/speculative-simple.cpp # examples/speculative/speculative.cpp # examples/training/finetune.cpp # ggml/CMakeLists.txt # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/common.h # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-sycl/fattn-tile.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py # ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl # scripts/sync-ggml.last # tests/export-graph-ops.cpp # tests/test-chat.cpp # tests/test-state-restore-fragmented.cpp # tests/test-thread-safety.cpp # tools/batched-bench/batched-bench.cpp # tools/cli/cli.cpp # tools/cvector-generator/cvector-generator.cpp # tools/export-lora/export-lora.cpp # tools/imatrix/imatrix.cpp # tools/perplexity/perplexity.cpp # tools/results/results.cpp # tools/server/CMakeLists.txt	2026-04-01 10:54:13 +08:00
Aldehir Rojas	624733d631	common : gpt-oss handle builtin and unsolicited tool calls (#21213 )	2026-03-31 13:52:42 +02:00
lainon1	0b6ff47996	fix: correct misspellings in code comments (#21217 ) - emdeddings → embeddings (gemma3.cpp, gemma3n-iswa.cpp, gemma-embedding.cpp) - imlpemented → implemented (llama-adapter.cpp) - interere → interfere (llama-graph.cpp) - overridde → overridden (chat.cpp) - stastistics → statistics (ngram-map.h) - layed → laid (llama-kv-cache.h) - worster → worst (llama-context.cpp) - sequantial → sequential (llama-batch.h)	2026-03-31 13:50:51 +02:00
Concedo	42ad89cd86	Merge branch 'upstream' into concedo_experimental # Conflicts: # .devops/cann.Dockerfile # .devops/cpu.Dockerfile # .devops/llama-cli-cann.Dockerfile # .devops/nix/package.nix # .github/workflows/build-android.yml # .github/workflows/build-cann.yml # .github/workflows/build-msys.yml # .github/workflows/docker.yml # .github/workflows/editorconfig.yml # .github/workflows/gguf-publish.yml # .github/workflows/python-lint.yml # .github/workflows/release.yml # CMakeLists.txt # docs/backend/CANN.md # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-rpc/ggml-rpc.cpp # scripts/sync_vendor.py # tests/test-chat-auto-parser.cpp # tests/test-chat.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-reasoning-budget.cpp # tools/cli/cli.cpp # tools/server/CMakeLists.txt # tools/server/README.md	2026-03-30 20:45:38 +08:00
Aldehir Rojas	e6f2ec01ff	common : add reasoning_format = none support to gpt-oss (#21094 )	2026-03-28 09:33:39 -05:00
Concedo	c00fe0af5a	Merge commit '`9f102a1407`' into concedo_experimental # Conflicts: # .devops/intel.Dockerfile # .github/ISSUE_TEMPLATE/010-bug-compilation.yml # .github/ISSUE_TEMPLATE/011-bug-results.yml # .github/pull_request_template.md # CODEOWNERS # README.md # common/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/binary-ops.c # ggml/src/ggml-hexagon/htp/hex-dma.c # ggml/src/ggml-hexagon/htp/hex-dma.h # ggml/src/ggml-hexagon/htp/hex-dump.h # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/hvx-utils.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/ssm-conv.c # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # ggml/src/ggml-rpc/ggml-rpc.cpp # scripts/snapdragon/adb/run-bench.sh # scripts/sync_vendor.py # tests/test-backend-ops.cpp # tools/llama-bench/llama-bench.cpp	2026-03-25 23:45:41 +08:00
Aldehir Rojas	312d870a89	common : replace wrap_for_generation with a prefix convenience function and fix gpt-oss (#20912 )	2026-03-23 22:21:47 -05:00
Concedo	6054bacadd	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/ai-issues.yml # CONTRIBUTING.md # docs/autoparser.md # docs/ops.md # docs/ops/Metal.csv # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/hex-dma.h # ggml/src/ggml-hexagon/htp/hex-utils.h # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/htp-msg.h # ggml/src/ggml-hexagon/htp/htp_iface.idl # ggml/src/ggml-hexagon/htp/hvx-base.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hip/CMakeLists.txt # models/templates/Apriel-1.6-15b-Thinker-fixed.jinja # models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja # models/templates/deepseek-ai-DeepSeek-V3.1.jinja # models/templates/llama-cpp-deepseek-r1.jinja # models/templates/meetkai-functionary-medium-v3.1.jinja # scripts/fetch_server_test_models.py # scripts/snapdragon/adb/run-cli.sh # scripts/snapdragon/adb/run-completion.sh # scripts/snapdragon/adb/run-mtmd.sh # scripts/snapdragon/adb/run-tool.sh # tests/test-chat-auto-parser.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat.cpp # tools/cli/cli.cpp # tools/server/README.md	2026-03-21 12:06:01 +08:00
Concedo	98f099aecc	Merge commit '`c1258830b2`' into concedo_experimental # Conflicts: # docs/docker.md # docs/ops.md # docs/ops/WebGPU.csv # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl	2026-03-21 12:00:52 +08:00
James O'Leary	c46583b86b	common/parser : fix out_of_range crash in throw path (#20424 regression) (#20777 ) * chat : fix out_of_range crash in throw path (#20424 regression) #20424 introduced effective_input = generation_prompt + input, but the throw path uses input.substr(result.end) where result.end is a position within effective_input. Every thinking model with a non-empty generation_prompt crashes with std::out_of_range instead of the intended error message. Test crashes on unpatched master, passes with fix: cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_BUILD_TOOLS=OFF cmake --build build --target test-chat ./build/bin/test-chat * Update test-chat.cpp * Update test-chat.cpp * Update test-chat.cpp --------- Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>	2026-03-20 02:37:22 +01:00
Piotr Wilkin (ilintar)	5e54d51b19	common/parser: add proper reasoning tag prefill reading (#20424 ) * Implement proper prefill extraction * Refactor cli parameters, update docs, move reasoning budget sampler part to common/reasoning-budget.cpp * Update tools/server/server-task.cpp * refactor: move grammars to variant, remove grammar_external, handle exception internally * Make code less C++y Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-03-19 16:58:21 +01:00
Aldehir Rojas	1b9bbaa357	common : fix gpt-oss content removal (#20745 )	2026-03-19 11:40:39 +01:00
Concedo	48f914e374	Merge branch 'upstream' into concedo_experimental # Conflicts: # ci/run.sh # ggml/CMakeLists.txt # ggml/src/ggml-cpu/arch/riscv/repack.cpp # ggml/src/ggml-cpu/arch/x86/repack.cpp # ggml/src/ggml-cpu/repack.cpp # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/htp-msg.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/hvx-base.h # ggml/src/ggml-hexagon/htp/hvx-exp.h # ggml/src/ggml-hexagon/htp/hvx-sigmoid.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/softmax-ops.c # ggml/src/ggml-hexagon/htp/unary-ops.c # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/sync-ggml.last # tests/test-backend-sampler.cpp # tests/test-chat.cpp # tests/test-jinja.cpp # tools/cli/cli.cpp	2026-03-19 02:23:06 +08:00
Aldehir Rojas	5e8910a0db	common : rework gpt-oss parser (#20393 ) * common : rework gpt-oss parser * cont : fix gpt-oss tests * cont : add structured output test * cont : rename final to final_msg	2026-03-18 10:41:25 +01:00
Piotr Wilkin (ilintar)	d2ecd2d1cf	common/parser: add `--skip-chat-parsing` to force a pure content parser. (#20289 ) * Add `--force-pure-content` to force a pure content parser. * Update common/arg.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Change parameter name [no ci] --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-03-17 16:16:43 +01:00
Concedo	f31b040941	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/labeler.yml # .github/workflows/build-self-hosted.yml # benches/nemotron/nemotron-dgx-spark.md # docs/ops.md # docs/ops/SYCL.csv # ggml/src/ggml-cpu/kleidiai/kleidiai.cpp # ggml/src/ggml-sycl/backend.hpp # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/element_wise.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # scripts/sync-ggml.last # tests/test-jinja.cpp # tests/test-llama-archs.cpp	2026-03-17 14:05:23 +08:00
Aldehir Rojas	1bbec6a75d	jinja : add capability check for object args (#20612 )	2026-03-16 17:43:14 +01:00
Concedo	b1c500ae2b	Merge commit '`2948e6049a`' into concedo_experimental # Conflicts: # .github/workflows/build.yml # CONTRIBUTING.md # docs/backend/VirtGPU/development.md # docs/ops.md # docs/ops/WebGPU.csv # embd_res/templates/GigaChat3-10B-A1.8B.jinja # embd_res/templates/GigaChat3.1-10B-A1.8B.jinja # ggml/src/ggml-hip/CMakeLists.txt # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/sync_vendor.py # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-chat.cpp # tests/test-grammar-integration.cpp # tests/test-quantize-fns.cpp	2026-03-15 11:21:24 +08:00
Concedo	67c9798d0b	Merge commit '`3ca19b0e9f`' into concedo_experimental # Conflicts: # .github/workflows/build.yml # common/CMakeLists.txt # common/chat-peg-parser.cpp # docs/backend/SYCL.md # docs/ops.md # docs/ops/SYCL.csv # ggml/src/ggml-sycl/common.hpp # ggml/src/ggml-sycl/convert.hpp # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/norm.cpp # ggml/src/ggml-sycl/rope.cpp # ggml/src/ggml-sycl/rope.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl # scripts/compare-llama-bench.py # scripts/sync_vendor.py # tests/CMakeLists.txt # tools/cli/cli.cpp	2026-03-15 11:11:31 +08:00
Concedo	ff3f8533d3	Merge commit '`c96f608d98`' into concedo_experimental # Conflicts: # CONTRIBUTING.md # docs/ops.md # docs/ops/Vulkan.csv # models/templates/LFM2-8B-A1B.jinja # tests/peg-parser/test-python-dict-parser.cpp # tests/peg-parser/test-unicode.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat.cpp # tools/llama-bench/llama-bench.cpp	2026-03-14 17:14:34 +08:00
Concedo	04915d99ee	Merge commit '`451ef08432`' into concedo_experimental # Conflicts: # .github/workflows/build.yml # README.md # docs/ops.md # docs/ops/Vulkan.csv # src/llama-model-loader.cpp # src/llama-model.cpp # src/llama.cpp # tests/CMakeLists.txt # tests/peg-parser/test-basic.cpp # tests/peg-parser/test-json-parser.cpp # tests/peg-parser/test-python-dict-parser.cpp # tests/peg-parser/test-unicode.cpp # tests/test-chat-auto-parser.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat.cpp # tools/CMakeLists.txt	2026-03-13 23:33:37 +08:00
Concedo	d2c911884d	Merge commit '`213c4a0b81`' into concedo_experimental # Conflicts: # CODEOWNERS # common/CMakeLists.txt # common/chat-peg-parser.cpp # common/chat.cpp # docs/backend/SYCL.md # docs/development/parsing.md # docs/ops.md # docs/ops/SYCL.csv # embd_res/templates/Apriel-1.6-15b-Thinker-fixed.jinja # embd_res/templates/Bielik-11B-v3.0-Instruct.jinja # embd_res/templates/GLM-4.7-Flash.jinja # embd_res/templates/LFM2-8B-A1B.jinja # embd_res/templates/StepFun3.5-Flash.jinja # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-sycl/CMakeLists.txt # ggml/src/ggml-sycl/backend.hpp # ggml/src/ggml-sycl/common.hpp # ggml/src/ggml-sycl/convert.cpp # ggml/src/ggml-sycl/convert.hpp # ggml/src/ggml-sycl/count-equal.cpp # ggml/src/ggml-sycl/dpct/helper.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/presets.hpp # ggml/src/ggml-sycl/softmax.cpp # ggml/src/ggml-sycl/vecdotq.hpp # models/templates/Apertus-8B-Instruct.jinja # models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja # models/templates/Qwen-QwQ-32B.jinja # models/templates/Qwen3-Coder.jinja # models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja # models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja # models/templates/deepseek-ai-DeepSeek-V3.1.jinja # models/templates/fireworks-ai-llama-3-firefunction-v2.jinja # models/templates/moonshotai-Kimi-K2.jinja # models/templates/unsloth-Apriel-1.5.jinja # tests/CMakeLists.txt # tests/peg-parser/test-basic.cpp # tests/peg-parser/tests.h # tests/test-backend-ops.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat-template.cpp # tests/test-chat.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-peg-parser.cpp # tools/CMakeLists.txt # tools/cli/cli.cpp	2026-03-13 21:35:56 +08:00
Mishusha	a8304b4d27	common/parser: add GigaChatV3/3.1 models support (#19931 ) Co-authored-by: Mishusha <pmv26021975@gmail.com>	2026-03-12 01:22:25 +01:00
Piotr Wilkin (ilintar)	acb7c79069	common/parser: handle reasoning budget (#20297 ) * v1 * Finished! * Handlie cli * Reasoning sampler * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Less explosive terminology :) * Add utf-8 case and tests * common : migrate reasoning budget sampler to common * cont : clean up * cont : expose state and allow passing as initial state * cont : remove unused imports * cont : update state machine doc string --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Alde Rojas <hello@alde.dev>	2026-03-11 10:26:12 +01:00
Piotr Wilkin (ilintar)	6c770d16ca	Reduce level of content parser warning message to avoid log spam on non-debug verbosity (#20347 )	2026-03-10 15:21:51 +01:00
Piotr Wilkin (ilintar)	f76565db92	common: map developer role to system (#20215 ) * Map developer role to system * Simplify	2026-03-09 14:25:11 +01:00
Piotr Wilkin (ilintar)	97c64fbdbd	PEG parser for LFM2 (#20251 ) * PEG parser for LFM2 * Simplify using python_value()	2026-03-09 01:11:22 +01:00
Aldehir Rojas	451ef08432	common : gracefully handle incomplete output (#20191 ) * common : handle incomplete UTF-8 at end of input in PEG parser * cont : if reached end prematurely, emit needs_more_input to propagate partial output * cont: refactor peg parse context to add lenient flag * cont : remove partial flag, keep lenient flag	2026-03-08 17:17:02 +01:00
Piotr Wilkin (ilintar)	b283f6d5b3	Revert to OAI-compatible args (#20213 ) * Revert to OAI-compatible args * Apply workaround::func_args_not_string	2026-03-08 11:33:03 +01:00
Piotr Wilkin (ilintar)	566059a26b	Autoparser - complete refactoring of parser architecture (#18675 ) * Autoparser - full single commit squish * Final pre-merge changes: minor fixes, Kimi 2.5 model parser	2026-03-06 21:01:00 +01:00
Concedo	d06700687f	Merge branch 'upstream' into concedo_experimental # Conflicts: # .devops/rocm.Dockerfile # .github/workflows/release.yml # CMakeLists.txt # ggml/src/ggml-cuda/common.cuh # scripts/sync_vendor.py # tests/test-chat.cpp	2026-02-22 09:33:13 +08:00
Aldehir Rojas	94b0200a01	common : merge qwen3-coder and nemotron nano 3 parsers (#19765 ) * common : migrate qwen3-coder to PEG parsing variant * cont : add JSON parameter test	2026-02-20 23:22:22 +01:00
Concedo	e626de2430	Merge branch 'upstream' into concedo_experimental # Conflicts: # docs/ops.md # docs/ops/WebGPU.csv # embd_res/templates/stepfun-ai-Step-3.5-Flash.jinja # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl # src/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-chat.cpp # tools/mtmd/CMakeLists.txt	2026-02-20 15:16:26 +08:00
Concedo	07c45ced56	Merge commit '`c78e682245`' into concedo_experimental # Conflicts: # src/models/qwen35.cpp # src/models/qwen35moe.cpp	2026-02-20 14:41:32 +08:00
Jesse Posner	3dadc88b58	common : fix Step-3.5-Flash format detection and thinking support (#19635 ) * common : fix Step-3.5-Flash format detection and thinking support Step-3.5-Flash uses the same XML-style tool call format as Qwen3-Coder (<tool_call><function=...><parameter=...>) but its Jinja template lacks the bare <function> and plural <parameters> markers that the detection logic previously required. This caused it to fall through to Hermes 2 Pro, which doesn't call func_args_not_string(), so arguments stayed as JSON strings and templates using arguments\|items crashed. Additionally, the Qwen3-Coder-XML format handler had no thinking support. Models like Step-3.5-Flash that unconditionally emit <think> in their generation prompt need the same thinking_forced_open handling that Nemotron v3 and Hermes 2 Pro already have, otherwise reasoning_content is never separated from content in API responses. Changes: - Relax Qwen3-Coder XML detection to only require the 3 shared markers - Tighten Nemotron v3 branch to also require bare <function> and plural <parameters>, preventing Step-3.5-Flash from being misrouted via <think> - Add thinking_forced_open support to Qwen3-Coder-XML init function - Add <think>/</think> to preserved tokens - Fix build_grammar_xml_tool_call to handle thinking_forced_open in the grammar root rule, allowing </think> before tool calls - Add Step-3.5-Flash chat template and format detection test Builds on: https://github.com/ggml-org/llama.cpp/pull/19283 * chat : route Step-3.5-Flash to Nemotron v3 PEG parser, add tests Step-3.5-Flash uses the same XML tool call format as Qwen3-Coder and Nemotron 3 Nano (<tool_call>/<function=...>/<parameter=...>) but with unconditional <think> output. Route it to the Nemotron v3 PEG parser for streaming and schema-aware parameter parsing. Detection: templates with <think> + XML tool tags use Nemotron v3 PEG parser; templates without <think> (Qwen3-Coder) use GBNF grammar. Tests cover: basic messages, tool calls with/without thinking content, parallel tool calls, code string parameters, optional </parameter> closing tags, and JSON schema response format. * chat : remove dead thinking code from qwen3_coder_xml Remove thinking handling code that became unreachable after routing Step-3.5-Flash to the Nemotron v3 PEG parser. Qwen3-Coder has no <think> in its template, so the thinking_forced_open logic, preserved tokens, and grammar prefix were dead paths.	2026-02-19 22:40:52 +01:00
abhijitb11	39e4b1dc9b	common : fix gpt-oss Jinja error when assistant message has both content and thinking with tool calls (#19704 )	2026-02-19 14:59:20 -06:00
Tarek Dakhran	c5897995a7	mtmd : chat : Fix extra \n between text and media marker (#19595 ) * mtmd : chat : Fix extra \n between text and media marker Thanks to @tugot17 for detecting and reporting the issue. For vision models (e.g. LFM2.5-VL-1.6B and Qwen/Qwen3-VL-4B-Instruct) `llama-mtmd-cli` produces identical output to HF implementation. However `llama-server` doesn't. I traced it down to extra newline inserted after `<__media__>`. This happens in `to_json_oaicompat`, that treats media markers as text and joins all parts with `\n` separator. PR introduces new type `media_marker` and uses it for media markers. Extra logic is added to prevent insertion of newlines before and after media markers. With this change number of input tokens is identical to HF implementation and as a result the output is also identical. I explored other ways to address the issue * remove completely `\n` between text parts in `to_json_oaicompat` * merge text messages in server-common.cpp before sending them to `to_json_oaicompat` Please propose alternative ways of fixing this issue. * Refactor to use explicite per type ifs * Update common/chat.cpp Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com> * Update common_chat_templates_apply_legacy --------- Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>	2026-02-19 12:18:57 +01:00
Concedo	bff3fd3e34	Merge branch 'upstream' into concedo_experimental # Conflicts: # common/common.cpp # docs/backend/snapdragon/README.md # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # scripts/pr2wt.sh # tests/test-backend-ops.cpp # tools/server/README.md	2026-02-13 14:00:45 +08:00
Concedo	261d78eaaa	Merge branch 'upstream' into concedo_experimental # Conflicts: # CMakeLists.txt # README.md # docs/speculative.md # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/ggml-cann.cpp # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tools/mtmd/clip.cpp	2026-02-12 18:05:20 +08:00
Xuan-Son Nguyen	98e57ca422	chat: fix case where template accepts type content only (#19419 ) * chat: fix case where template accepts type content only * rm stray log * reuse render_message_to_json	2026-02-09 22:14:12 +01:00
Concedo	8d173f50c2	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build.yml # docs/backend/SYCL.md # docs/backend/snapdragon/CMakeUserPresets.json # docs/backend/snapdragon/README.md # docs/backend/snapdragon/developer.md # docs/ops.md # docs/ops/SYCL.csv # embd_res/templates/upstage-Solar-Open-100B.jinja # ggml/src/CMakeLists.txt # ggml/src/ggml-hexagon/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/element_wise.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl # tests/test-chat.cpp	2026-01-30 15:32:59 +08:00
Aldehir Rojas	7b7ae857f6	chat : add parsing for solar-open-100b (#18540 ) * chat : add parsing for solar-open-100b * add comments to rules * cont : make assistant start optional * cont : remove assistant start prefix altogether --------- Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>	2026-01-29 16:06:15 +01:00
Concedo	7e755014b2	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/winget.yml # CODEOWNERS # common/CMakeLists.txt # common/arg.cpp # docs/ops/SYCL.csv # examples/lookup/lookup-create.cpp # examples/lookup/lookup-stats.cpp # examples/lookup/lookup.cpp # examples/speculative-simple/speculative-simple.cpp # examples/speculative/speculative.cpp # ggml/src/ggml-hip/CMakeLists.txt # ggml/src/ggml-sycl/dpct/helper.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/norm.cpp # ggml/src/ggml-zendnn/ggml-zendnn.cpp # tests/test-chat-template.cpp	2026-01-29 23:05:05 +08:00
Sigbjørn Skjæret	b45ef2702c	jinja : do not pass empty tools and add some none filters (#19176 ) Some checks are pending Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run Details Python check requirements.txt / check-requirements (push) Waiting to run Details Python Type-Check / pyright type-check (push) Waiting to run Details Update Operations Documentation / update-ops-docs (push) Waiting to run Details	2026-01-29 14:06:54 +01:00
Concedo	f6ece6fd37	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/check-vendor.yml # .github/workflows/close-issue.yml # .github/workflows/editorconfig.yml # .github/workflows/gguf-publish.yml # .github/workflows/labeler.yml # .github/workflows/pre-tokenizer-hashes.yml # .github/workflows/python-check-requirements.yml # .github/workflows/python-lint.yml # .github/workflows/python-type-check.yml # .github/workflows/server.yml # .github/workflows/update-ops-docs.yml # README.md # docs/build.md # examples/model-conversion/scripts/utils/perplexity-gen.sh # examples/model-conversion/scripts/utils/perplexity-run-simple.sh # examples/model-conversion/scripts/utils/perplexity-run.sh # examples/model-conversion/scripts/utils/quantize.sh # examples/model-conversion/scripts/utils/run-embedding-server.sh # ggml/src/ggml-cpu/ggml-cpu.c # ggml/src/ggml-hexagon/htp/flash-attn-ops.c # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl # ggml/src/ggml-sycl/ggml-sycl.cpp # scripts/compare-llama-bench.py # tests/test-backend-ops.cpp # tests/test-gguf.cpp # tools/cli/README.md # tools/completion/README.md # tools/server/README.md	2026-01-27 23:06:13 +08:00
Xuan-Son Nguyen	bb02f74c61	chat: fix language input for translategemma (#19052 ) * chat: fix language input for translategemma * Update common/chat.cpp Co-authored-by: Aldehir Rojas <hello@alde.dev> --------- Co-authored-by: Aldehir Rojas <hello@alde.dev>	2026-01-24 17:58:45 +01:00
Xuan-Son Nguyen	b5b8fa1c8b	chat : fix translategemma crash on common_chat_format_example (#19019 )	2026-01-23 12:03:42 +01:00
Concedo	e8e7c357c9	Merge branch 'upstream' into concedo_experimental # Conflicts: # .github/workflows/build-cache.yml # .github/workflows/build-cmake-pkg.yml # .github/workflows/build-linux-cross.yml # .github/workflows/build.yml # .github/workflows/check-vendor.yml # .github/workflows/close-issue.yml # .github/workflows/copilot-setup-steps.yml # .github/workflows/docker.yml # .github/workflows/editorconfig.yml # .github/workflows/gguf-publish.yml # .github/workflows/labeler.yml # .github/workflows/pre-tokenizer-hashes.yml # .github/workflows/python-check-requirements.yml # .github/workflows/python-lint.yml # .github/workflows/python-type-check.yml # .github/workflows/release.yml # .github/workflows/server-webui.yml # .github/workflows/server.yml # .github/workflows/update-ops-docs.yml # .github/workflows/winget.yml # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-zdnn/ggml-zdnn.cpp # requirements/requirements-tool_bench.txt # src/CMakeLists.txt # src/llama-quant.cpp # tests/test-backend-ops.cpp # tests/test-chat.cpp # tools/cli/cli.cpp # tools/server/README.md	2026-01-23 14:27:04 +08:00
Xuan-Son Nguyen	51fa458a92	server : support preserving reasoning_content in assistant message (#18994 ) * support reasoning_content input * report template caps to webui * add docs * rm commented code	2026-01-22 21:30:06 +01:00
Concedo	4984c9bc16	Merge commit '`12a4a47e6a`' into concedo_experimental # Conflicts: # ci/run.sh # examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh # examples/model-conversion/scripts/causal/run-converted-model.sh # examples/model-conversion/scripts/embedding/run-converted-model.sh # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-zdnn/ggml-zdnn.cpp # ggml/src/ggml-zendnn/ggml-zendnn.cpp # tests/CMakeLists.txt # tests/test-chat-parser.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat.cpp # tools/cli/cli.cpp	2026-01-21 21:00:44 +08:00

1 2 3 4

152 commits