Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	docs/speculative.md
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
#	ggml/src/ggml-hexagon/htp/hmx-ops.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	ggml/src/ggml-hexagon/htp/rope-ops.c
#	ggml/src/ggml-hexagon/htp/ssm-conv.c
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	scripts/snapdragon/adb/run-bench.sh
#	scripts/snapdragon/adb/run-cli.sh
#	scripts/snapdragon/adb/run-completion.sh
#	scripts/snapdragon/adb/run-mtmd.sh
#	scripts/snapdragon/windows/run-bench.ps1
#	scripts/snapdragon/windows/run-cli.ps1
#	scripts/snapdragon/windows/run-completion.ps1
#	scripts/snapdragon/windows/run-mtmd.ps1
#	src/llama-vocab.cpp
#	tests/test-backend-ops.cpp
#	tools/batched-bench/CMakeLists.txt
#	tools/batched-bench/batched-bench.cpp
#	tools/cli/CMakeLists.txt
#	tools/cli/README.md
#	tools/cli/cli.cpp
#	tools/completion/CMakeLists.txt
#	tools/completion/README.md
#	tools/llama-bench/CMakeLists.txt
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/tests/test-deepseek-ocr.py
#	tools/mtmd/tests/tests-requirements.txt
#	tools/perplexity/CMakeLists.txt
#	tools/perplexity/perplexity.cpp
#	tools/quantize/CMakeLists.txt
#	tools/server/CMakeLists.txt
#	tools/server/README.md
#	ty.toml
This commit is contained in:
Concedo 2026-05-21 23:47:21 +08:00
commit 718dc159b6
83 changed files with 1469 additions and 648 deletions

View file

@@ -3593,6 +3593,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.draft.p_min = std::stof(value);
}
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
// Register the --spec-draft-backend-sampling / --no-spec-draft-backend-sampling
// option pair. The handler stores the received boolean in
// params.speculative.draft.backend_sampling; the help text reports that field's
// value at registration time as the default.
add_opt(
    common_arg(
        {"--spec-draft-backend-sampling"},
        {"--no-spec-draft-backend-sampling"},
        string_format(
            "offload draft sampling to the backend (default: %s)",
            params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
        // Captureless so it stays usable wherever a plain handler is expected.
        [](common_params & cfg, bool enabled) {
            cfg.speculative.draft.backend_sampling = enabled;
        })
        .set_spec()
        .set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})
        .set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
add_opt(common_arg(
{"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"