Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/intel.Dockerfile
#	.github/workflows/build-android.yml
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	.gitignore
#	docs/backend/SYCL.md
#	docs/backend/snapdragon/README.md
#	examples/model-conversion/scripts/causal/convert-model.sh
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/CMakeLists.txt
#	ggml/src/ggml-hexagon/htp/hex-utils.h
#	ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
#	ggml/src/ggml-hexagon/htp/htp-ctx.h
#	ggml/src/ggml-hexagon/htp/htp-ops.h
#	ggml/src/ggml-hexagon/htp/htp_iface.idl
#	ggml/src/ggml-hexagon/htp/hvx-base.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	ggml/src/ggml-hexagon/libggml-htp.inf
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/mmvq.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl
#	scripts/server-test-structured.py
#	scripts/snapdragon/adb/run-bench.sh
#	scripts/snapdragon/adb/run-cli.sh
#	scripts/snapdragon/adb/run-completion.sh
#	scripts/snapdragon/adb/run-mtmd.sh
#	scripts/snapdragon/adb/run-tool.sh
#	scripts/snapdragon/qdc/requirements.txt
#	scripts/snapdragon/windows/run-bench.ps1
#	scripts/snapdragon/windows/run-cli.ps1
#	scripts/snapdragon/windows/run-completion.ps1
#	scripts/snapdragon/windows/run-mtmd.ps1
#	scripts/snapdragon/windows/run-tool.ps1
#	tests/test-backend-ops.cpp
#	tools/cli/cli.cpp
#	ty.toml
This commit is contained in:
Concedo 2026-04-25 12:13:14 +08:00
commit 340b22283e
20 changed files with 260 additions and 98 deletions

View file

@ -558,6 +558,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
static bool is_lfm2_template(const std::string & src) {
return src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos;
}
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
common_chat_prompt_preset asr_preset;
asr_preset.system = "";
asr_preset.user = "Transcribe audio to text";
if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
asr_preset.system = "Perform ASR.";
asr_preset.user = "";
}
return asr_preset;
}
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
@ -2067,10 +2087,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
if (is_lfm2_template(src)) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
@ -2379,4 +2396,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
GGML_ASSERT(chat_templates->template_default != nullptr);
return chat_templates->template_default->caps.to_map();
}

View file

@ -274,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
autoparser::generation_params & params);
// specialized per-task preset
struct common_chat_prompt_preset {
std::string system;
std::string user;
};
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);

View file

@ -747,6 +747,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
str.compare(0, prefix.size(), prefix) == 0;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, char prefix) {
return !str.empty() && str.front() == prefix;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&

View file

@ -1,4 +1,3 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"

View file

@ -106,10 +106,16 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
virtual value execute_impl(context &) { throw_exec_error(); }
// execute is the public method to execute a statement with error handling
value execute(context &);
private:
[[noreturn]] void throw_exec_error() const {
throw std::runtime_error("cannot exec " + type());
}
};
// Type Checking Utilities
@ -143,7 +149,7 @@ struct program : public statement {
program() = default;
explicit program(statements && body) : body(std::move(body)) {}
std::string type() const override { return "Program"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
}
};
@ -195,7 +201,7 @@ struct break_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw break_statement::signal();
}
};
@ -209,7 +215,7 @@ struct continue_statement : public statement {
}
};
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw continue_statement::signal();
}
};
@ -509,7 +515,7 @@ struct slice_expression : public expression {
chk_type<expression>(this->step_expr);
}
std::string type() const override { return "SliceExpression"; }
value execute_impl(context &) override {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
};

View file

@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
}
[[noreturn]] static value string_join_not_implemented(const func_args &) {
throw not_implemented_exception("String join builtin not implemented");
}
const func_builtins & value_string_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
res->val_str.mark_input_based_on(val_input->as_string());
return res;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
}},
{"join", string_join_not_implemented},
};
return builtins;
}
@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
return builtins;
}
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
throw not_implemented_exception("Array unique builtin not implemented");
}
const func_builtins & value_array_t::get_builtins() const {
static const func_builtins builtins = {
@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
}},
{"unique", array_unique_not_implemented},
};
return builtins;
}
[[noreturn]] static value object_join_not_implemented(const func_args &) {
throw not_implemented_exception("object join not implemented");
}
const func_builtins & value_object_t::get_builtins() const {
if (!has_builtins) {
@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
});
return result;
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("object join not implemented");
}},
{"join", object_join_not_implemented},
};
return builtins;
}

View file

@ -129,27 +129,25 @@ struct value_t {
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
virtual double as_float() const { throw_type_error("is not a float value"); }
virtual string as_string() const { throw_type_error("is not a string value"); }
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
virtual const func_builtins & get_builtins() const {
throw std::runtime_error("No builtins available for type " + type());
}
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
@ -163,6 +161,11 @@ struct value_t {
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
private:
[[noreturn]] void throw_type_error(const char* expected) const {
throw std::runtime_error(type() + " " + expected);
}
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }

View file

@ -746,7 +746,12 @@ class ModelBase:
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
quant_config = json.load(f).get("quantization") or {}
hf_quant_config = json.load(f)
quant_config = hf_quant_config.get("quantization") or {}
producer = hf_quant_config.get("producer") or {}
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}

View file

@ -3608,6 +3608,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return true;
}
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
const ggml_tensor * unary = cgraph->nodes[node_idx];
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
return false;
}
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
return false;
}
if (unary->type != sqr->type) {
return false;
}
if (!ggml_is_contiguous(unary->src[0])) {
return false;
}
return true;
}
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
const ggml_tensor *scale = cgraph->nodes[node_idx];
@ -4116,6 +4140,12 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
i += 2;
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);

View file

@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) {
return x * x;
}
static __device__ __forceinline__ float op_relu_sqr(float x) {
const float r = fmaxf(x, 0.0f);
return r * r;
}
static __device__ __forceinline__ float op_sqrt(float x) {
return sqrtf(x);
}
@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
GGML_ABORT("Unsupported unary op for fused unary+mul");
}
}
/* fused relu + sqr */
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
const ggml_tensor * src = relu_node->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src));
GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
GGML_ASSERT(src->type == sqr_node->type);
const int k = ggml_nelements(src);
if (src->type == GGML_TYPE_F16) {
unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
} else {
unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
}
}

View file

@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
return x / (1.0f + expf(-x));
}

View file

@ -820,7 +820,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
}
// print MTL GPU family:
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
GGML_LOG_INFO("%s: GPU name: %s (%s)\n", __func__, dev->props.name, dev->props.desc);
// determine max supported GPU family
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@ -937,13 +937,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
}
struct ggml_metal_event {
void * obj; // id<MTLEvent>
void * obj; // id<MTLSharedEvent>
atomic_int value;
};
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@ -951,7 +951,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
}
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@ -959,7 +959,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
}
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
id<MTLEvent> event = [dev->mtl_device newEvent];
id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
@ -970,7 +970,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
}
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLEvent> event = ev->obj;
id<MTLSharedEvent> event = ev->obj;
[event release];
free(ev);
@ -979,14 +979,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
}
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLEvent> event = ev->obj;
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
id<MTLSharedEvent> event = ev->obj;
const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
if (!res) {
GGML_ABORT("%s: failed to wait for event\n", __func__);
}
GGML_UNUSED(dev);
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {

View file

@ -7672,7 +7672,7 @@ size_t ggml_quantize_chunk(
int64_t nrows,
int64_t n_per_row,
const float * imatrix) {
const int64_t n = (int64_t) nrows * n_per_row;
const int64_t n = nrows * n_per_row;
if (ggml_quantize_requires_imatrix(type)) {
GGML_ASSERT(imatrix != NULL);
@ -7689,21 +7689,21 @@ size_t ggml_quantize_chunk(
size_t result = 0;
switch (type) {
case GGML_TYPE_Q1_0: result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_NVFP4: result = quantize_nvfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ1_0: result = quantize_tq1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_TQ2_0: result = quantize_tq2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@ -7768,10 +7768,10 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

View file

@ -281,6 +281,42 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) {
return chatcmpl_body;
}
// Edits the cch section of an "x-anthropic-billing-header" system prompt.
// Does nothing to any other prompt.
//
// This is a claude message with a "cch=ef01a" attribute that breaks prefix caching.
// The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as a
// system prompt data, particularly to llama.cpp, but its presence means the prefix
// cache will not get past it: It changes on each request.
//
// Reference: https://github.com/ggml-org/llama.cpp/pull/21793
// Example header:
// ```
// x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
// ^^^^^
// ```
static void normalize_anthropic_billing_header(std::string & system_text) {
if (system_text.rfind("x-anthropic-billing-header:", 0) != 0) {
return;
}
const size_t header_prefix_length = strlen("x-anthropic-billing-header:");
const size_t cch_length = 5;
const size_t index_cch = system_text.find("cch=", header_prefix_length);
if (index_cch == std::string::npos) {
return;
}
const size_t index_replace = index_cch + 4;
if (index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
for (size_t i = 0; i < cch_length; ++i) {
system_text[index_replace + i] = 'f';
}
} else {
LOG_ERR("anthropic string not as expected: %s", system_text.c_str());
}
}
json server_chat_convert_anthropic_to_oai(const json & body) {
json oai_body;
@ -292,10 +328,13 @@ json server_chat_convert_anthropic_to_oai(const json & body) {
if (system_param.is_string()) {
system_content = system_param.get<std::string>();
normalize_anthropic_billing_header(system_content);
} else if (system_param.is_array()) {
for (const auto & block : system_param) {
if (json_value(block, "type", std::string()) == "text") {
system_content += json_value(block, "text", std::string());
auto system_text = json_value(block, "text", std::string());
normalize_anthropic_billing_header(system_text);
system_content += system_text;
}
}
}
@ -475,7 +514,7 @@ json server_chat_convert_anthropic_to_oai(const json & body) {
}
// Pass through common params
for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
for (const auto & key : {"temperature", "top_p", "top_k", "stream", "chat_template_kwargs"}) {
if (body.contains(key)) {
oai_body[key] = body.at(key);
}
@ -535,6 +574,7 @@ json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json convert_transcriptions_to_chatcmpl(
const json & inp_body,
const common_chat_templates * tmpls,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files) {
// TODO @ngxson : this function may need to be improved in the future
@ -548,27 +588,29 @@ json convert_transcriptions_to_chatcmpl(
}
// handle input data
std::string prompt = json_value(inp_body, "prompt", std::string());
std::string language = json_value(inp_body, "language", std::string());
std::string prompt = json_value(inp_body, "prompt", std::string());
std::string language = json_value(inp_body, "language", std::string());
std::string response_format = json_value(inp_body, "response_format", std::string("json"));
if (response_format != "json") {
throw std::invalid_argument("Only 'json' response_format is supported for transcription");
}
const common_chat_prompt_preset preset = common_chat_get_asr_prompt(tmpls);
if (prompt.empty()) {
prompt = "Transcribe audio to text";
prompt = preset.user;
}
if (!language.empty()) {
prompt += string_format(" (language: %s)", language.c_str());
}
prompt += get_media_marker();
json messages = json::array();
if (!preset.system.empty()) {
messages.push_back({{"role", "system"}, {"content", preset.system}});
}
messages.push_back({{"role", "user"}, {"content", prompt}});
json chatcmpl_body = inp_body; // copy all fields
chatcmpl_body["messages"] = json::array({
{
{"role", "user"},
{"content", prompt},
},
});
chatcmpl_body["messages"] = messages;
// because input from form-data, everything is string, we need to correct the types here
std::string stream = json_value(inp_body, "stream", std::string("false"));

View file

@ -18,6 +18,7 @@ json server_chat_convert_anthropic_to_oai(const json & body);
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
json convert_transcriptions_to_chatcmpl(
const json & body,
const common_chat_templates * tmpls,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files);

View file

@ -947,7 +947,9 @@ json oaicompat_chat_params_parse(
json response_format = json_value(body, "response_format", json::object());
std::string response_type = json_value(response_format, "type", std::string());
if (response_type == "json_object") {
json_schema = json_value(response_format, "schema", json::object());
if (response_format.contains("schema") || json_schema.empty()) {
json_schema = json_value(response_format, "schema", json::object());
}
} else if (response_type == "json_schema") {
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
json_schema = json_value(schema_wrapper, "schema", json::object());

View file

@ -675,6 +675,10 @@ private:
int32_t n_ctx; // total context for all clients / slots
// set to llama_model_n_swa(model)
// if swa_full is enabled, this is set to 0 to simulate a non-SWA model
int32_t n_swa;
// slots / clients
std::vector<server_slot> slots;
@ -719,7 +723,7 @@ private:
return;
}
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
slot.prompt_save(*prompt_cache);
slot.prompt_clear(false);
prompt_cache->update();
@ -854,6 +858,8 @@ private:
}
}
n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
@ -996,7 +1002,7 @@ private:
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
}
}
@ -2415,9 +2421,6 @@ private:
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
// note: when n_swa == 0, the model does not use SWA
const auto n_swa = std::max(0, llama_model_n_swa(model));
// the largest pos_min required for a checkpoint to be useful
const auto pos_min_thold = std::max(0, pos_next - n_swa);
@ -2589,10 +2592,10 @@ private:
// make a checkpoint of the parts of the memory that cannot be rolled back.
// checkpoints are created only if:
// - the model does not support partial sequence removal
// - the model uses SWA and we are not using `swa_full`
// - the model uses SWA (and we are not using `swa_full`)
do_checkpoint = do_checkpoint && (
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
(n_swa > 0));
bool has_mtmd = false;
@ -3807,6 +3810,7 @@ void server_routes::init_routes() {
std::vector<raw_buffer> files;
json body = convert_transcriptions_to_chatcmpl(
json::parse(req.body),
meta->chat_params.tmpls.get(),
req.files,
files);
SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions");

View file

@ -270,6 +270,7 @@ task_params server_task::params_from_json_cmpl(
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
params.n_discard = std::max(0, params.n_discard);
params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1));
params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement

View file

@ -48,7 +48,7 @@ def test_clear_and_restore():
log = LogReader(server.log_path)
# verify feature is enabled
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" in log.drain()
res = server.make_request("POST", "/completion", data={
"prompt": LONG_PROMPT,
@ -59,7 +59,7 @@ def test_clear_and_restore():
original_prompt_n = res.body["timings"]["prompt_n"]
# Slot 0 is the only slot with KV — should NOT be cleared
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
# Launching slot 1 clears idle slot 0
res = server.make_request("POST", "/completion", data={
@ -68,7 +68,7 @@ def test_clear_and_restore():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" in log.drain()
# Re-send same prompt — should restore from cache-ram
res = server.make_request("POST", "/completion", data={
@ -86,7 +86,7 @@ def test_clear_and_restore():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
def test_disabled_with_flag():
@ -96,7 +96,7 @@ def test_disabled_with_flag():
log = LogReader(server.log_path)
# Feature should not be enabled
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" not in log.drain()
res = server.make_request("POST", "/completion", data={
"prompt": LONG_PROMPT,
@ -112,4 +112,4 @@ def test_disabled_with_flag():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()

View file

@ -81,7 +81,7 @@ if (LLAMA_BUILD_BORINGSSL)
target_link_libraries(${TARGET} PUBLIC ssl crypto)
elseif (LLAMA_BUILD_LIBRESSL)
set(LIBRESSL_VERSION "4.2.1" CACHE STRING "LibreSSL version")
set(LIBRESSL_VERSION "4.3.1" CACHE STRING "LibreSSL version")
message(STATUS "Fetching LibreSSL version ${LIBRESSL_VERSION}")
@ -161,12 +161,24 @@ if(LLAMA_BUILD_BORINGSSL OR LLAMA_BUILD_LIBRESSL)
if(LLAMA_BUILD_BORINGSSL)
target_compile_options(fipsmodule PRIVATE /w)
endif()
if(LLAMA_BUILD_LIBRESSL)
target_compile_options(ssl_obj PRIVATE /w)
target_compile_options(bs_obj PRIVATE /w)
target_compile_options(compat_obj PRIVATE /w)
target_compile_options(crypto_obj PRIVATE /w)
endif()
else()
target_compile_options(ssl PRIVATE -w)
target_compile_options(crypto PRIVATE -w)
if(LLAMA_BUILD_BORINGSSL)
target_compile_options(fipsmodule PRIVATE -w)
endif()
if(LLAMA_BUILD_LIBRESSL)
target_compile_options(ssl_obj PRIVATE -w)
target_compile_options(bs_obj PRIVATE -w)
target_compile_options(compat_obj PRIVATE -w)
target_compile_options(crypto_obj PRIVATE -w)
endif()
endif()
endif()