Merge commit '1e796eb41f' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.github/workflows/build-riscv.yml
#	.github/workflows/build-vulkan.yml
#	.github/workflows/build.yml
#	docs/backend/SYCL.md
#	docs/build.md
#	docs/development/HOWTO-add-model.md
#	embd_res/templates/Reka-Edge.jinja
#	ggml/CMakeLists.txt
#	ggml/src/ggml-rpc/CMakeLists.txt
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/dequantize.hpp
#	ggml/src/ggml-sycl/dmmv.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
#	ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl
#	tests/test-chat.cpp
#	tools/rpc/README.md
commit 768527b031
Concedo 2026-04-17 21:47:29 +08:00
29 changed files with 1544 additions and 248 deletions


@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
args_field = format.function_field + "." + args_field;
}
auto tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
auto tools_parser = p.eps();
if (format.section_start.empty() && !format.per_call_start.empty()) {
auto single_tool_parser = p.standard_json_tools(
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
} else {
tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
}
// Handle content wrappers if present
if (ctx.content && ctx.content->is_always_wrapped()) {


@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
private:
// Extract the tool-calling 'haystack' and delegate further analysis based on format
void analyze_tool_calls(const analyze_reasoning & reasoning);
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
// Analyze format based on position of function and argument name in needle
void analyze_tool_call_format(const std::string & haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle,
const analyze_reasoning & reasoning);
const analyze_reasoning & reasoning,
bool supports_parallel_tool_calls);
// Analyze specifics of JSON native format (entire tool call is a JSON object)
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle);
// Check if parallel calls in JSON native format are array wrapped or tag wrapped
void analyze_json_native_parallel_calls();
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
const std::string & fun_name_needle);


@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
: analyze_base(tmpl) {
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
analyze_tool_calls(reasoning);
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
if (caps.supports_parallel_tool_calls) {
@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
}
}
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
json assistant_no_tools = json{
{ "role", "assistant" },
{ "content", ASSISTANT_MSG }
@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
return;
}
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
}
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle,
const analyze_reasoning & reasoning) {
const analyze_reasoning & reasoning,
bool supports_parallel_tool_calls) {
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
return;
}
@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
if (format.mode == tool_format::JSON_NATIVE) {
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
if (supports_parallel_tool_calls) {
analyze_json_native_parallel_calls();
}
} else {
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
}
@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
format.per_call_end = trim_whitespace(format.per_call_end);
}
void analyze_tools::analyze_json_native_parallel_calls() {
json assistant_one_tool = json{
{ "role", "assistant" },
{ "content", "" },
{ "tool_calls", json::array({ first_tool_call }) }
};
json assistant_two_tools = json{
{ "role", "assistant" },
{ "content", "" },
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
};
template_params params;
params.messages = json::array({ user_msg, assistant_one_tool });
params.tools = tools;
params.add_generation_prompt = false;
params.enable_thinking = true;
auto comparison = compare_variants(
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
if (!comparison) {
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
return;
}
std::string & second_call = comparison->diff.right;
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
format.per_call_start = format.section_start;
format.per_call_end = format.section_end;
format.section_start.clear();
format.section_end.clear();
}
}
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
const std::string & fun_name_needle,
const std::string & arg_name_needle) {


@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
tool_args(schema(json(), "tool-" + name + "-schema", params));
@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
tool_args(schema(json(), "tool-" + name + "-schema", params));


@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
int n_input = input_tokens.size();
if (n_input >= params.n_ctx) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
llama_free(ctx);
llama_model_free(model);
return 1;


@ -202,8 +202,11 @@ extern "C" {
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
typedef void (*ggml_backend_comm_free_t)(void * comm_ctx);
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
// Split buffer type for tensor parallelism (old)
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
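
As a rough illustration of how the comm hooks introduced above are meant to be consumed (the meta backend later in this commit does exactly this), a caller looks them up by name and falls back when the backend does not provide them. A sketch with illustrative variable names, not part of the ggml API surface:

ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backends[0]));

auto comm_init      = (ggml_backend_comm_init_t)             ggml_backend_reg_get_proc_address(reg, "ggml_backend_comm_init");
auto comm_free      = (ggml_backend_comm_free_t)             ggml_backend_reg_get_proc_address(reg, "ggml_backend_comm_free");
auto comm_allreduce = (ggml_backend_comm_allreduce_tensor_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_comm_allreduce_tensor");

void * comm_ctx = comm_init ? comm_init(backends, n_backends) : nullptr; // nullptr when unsupported
if (comm_ctx != nullptr) {
    comm_allreduce(comm_ctx, tensors); // one tensor per backend, equal element counts
    comm_free(comm_ctx);
}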


@ -6,9 +6,9 @@
extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_PATCH_VERSION 1
#define RPC_PROTO_MAJOR_VERSION 4
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#ifdef __cplusplus
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");


@ -1791,8 +1791,32 @@ extern "C" {
int n_dims,
int mode);
// custom RoPE
// RoPE operations with extended options
// a is the input tensor to apply RoPE to, shape [n_embd, n_head, n_token]
// b is an int32 vector with size n_token
// c is freq factors (e.g. phi3-128k), (optional)
// mode can be GGML_ROPE_TYPE_NORMAL or NEOX; for MROPE and VISION mode, use ggml_rope_multi
//
// pseudo-code for computing theta:
// for i in [0, n_dims/2):
// theta[i] = b[i] * powf(freq_base, -2.0 * i / n_dims);
// theta[i] = theta[i] / c[i]; # if c is provided, divide theta by c
// theta[i] = rope_yarn(theta[i], ...); # note: theta = theta * freq_scale is applied here
//
// other params are used by YaRN RoPE scaling, these default values will disable YaRN:
// freq_scale = 1.0f
// ext_factor = 0.0f
// attn_factor = 1.0f
// beta_fast = 0.0f
// beta_slow = 0.0f
//
// example:
// (marking: c = cos, s = sin, 0 = unrotated)
// given a single head with size = 8 --> [00000000]
// GGML_ROPE_TYPE_NORMAL n_dims = 4 --> [cscs0000]
// GGML_ROPE_TYPE_NORMAL n_dims = 8 --> [cscscscs]
// GGML_ROPE_TYPE_NEOX n_dims = 4 --> [ccss0000]
// GGML_ROPE_TYPE_NEOX n_dims = 8 --> [ccccssss]
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1808,6 +1832,36 @@ extern "C" {
float beta_fast,
float beta_slow);
// multi-dimensional RoPE, for Qwen-VL and similar vision models
// mode can be either VISION, MROPE, or IMROPE; it cannot be combined with NORMAL or NEOX
// sections specify how many dimensions to rotate in each section:
// section length is equivalent to the number of cos/sin pairs, NOT the number of dims
// (i.e. the sum of the 4 sections is expected to be n_dims/2)
// trailing sections can be 0, meaning they are ignored
// all other options are identical to ggml_rope_ext
//
// important note:
// - NEOX ordering is automatically applied and cannot be disabled for MROPE and VISION
// if you need normal ordering, there are 2 methods:
// (1) split the tensor manually using ggml_view
// (2) permute the weight upon conversion
// - for VISION, n_dims must be head_size/2
//
// example M-RoPE:
// given sections = [t=4, y=2, x=2, 0]
// given a single head with size = 18 --> [000000000000000000]
// GGML_ROPE_TYPE_MROPE n_dims = 16 --> [ttttyyxxttttyyxx00] (cos/sin are applied in NEOX ordering)
// GGML_ROPE_TYPE_IMROPE n_dims = 16 --> [ttyxttyxttyxttyx00] (interleaved M-RoPE, still NEOX ordering)
// note: the theta for each dim is computed the same way as ggml_rope_ext, no matter the section
// in other words, idx used for theta: [0123456789... until n_dims/2], not reset for each section
//
// example vision RoPE:
// given sections = [y=4, x=4, 0, 0] (last 2 sections are ignored)
// given a single head with size = 8 --> [00000000]
// GGML_ROPE_TYPE_VISION n_dims = 4 --> [yyyyxxxx]
// other values of n_dims are untested and their behavior is undefined
// note: unlike MROPE, the theta for each dim is computed differently for each section
// in other words, idx used for theta: [0123] for y section, then [0123] for x section
GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
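
To make the theta pseudo-code above concrete, a small C++ sketch for a single token at position pos; the names are illustrative and the YaRN step is only indicated by a comment. This is not the ggml implementation:

#include <cmath>

// theta for one token at position pos, following the pseudo-code above;
// c may be nullptr (no frequency factors); with the default YaRN params
// rope_yarn reduces to theta[i] *= freq_scale (1.0f), so it is omitted here.
static void rope_theta(float * theta, int n_dims, float pos, const float * c, float freq_base) {
    for (int i = 0; i < n_dims/2; i++) {
        theta[i] = pos * std::pow(freq_base, -2.0f*i/n_dims);
        if (c != nullptr) {
            theta[i] /= c[i]; // optional freq factors, e.g. phi3-128k
        }
    }
}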


@ -1419,22 +1419,48 @@ struct ggml_backend_meta_context {
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
void * comm_ctx = nullptr;
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
name = "Meta(";
std::vector<ggml_backend_t> simple_backends;
backend_configs.reserve(n_devs);
simple_backends.reserve(n_devs);
for (size_t i = 0; i < n_devs; i++) {
ggml_backend_dev_t simple_dev = ggml_backend_meta_dev_simple_dev(meta_dev, i);
if (i > 0) {
name += ",";
}
name += ggml_backend_dev_name(simple_dev);
backend_configs.emplace_back(ggml_backend_dev_init(simple_dev, params));
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
backend_configs.emplace_back(simple_backends.back());
}
name += ")";
if (n_devs > 1) {
ggml_backend_comm_init_t comm_init = (ggml_backend_comm_init_t) ggml_backend_reg_get_proc_address(
ggml_backend_dev_backend_reg(ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_init");
if (comm_init != nullptr) {
comm_ctx = comm_init(simple_backends.data(), simple_backends.size());
}
}
if (comm_ctx != nullptr) {
comm_allreduce = (ggml_backend_comm_allreduce_tensor_t)
ggml_backend_reg_get_proc_address(ggml_backend_dev_backend_reg(
ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_allreduce_tensor");
GGML_ASSERT(comm_allreduce != nullptr);
}
}
~ggml_backend_meta_context() {
if (comm_ctx != nullptr) {
ggml_backend_comm_free_t comm_free = (ggml_backend_comm_free_t) ggml_backend_reg_get_proc_address(
ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_configs[0].backend)), "ggml_backend_comm_free");
GGML_ASSERT(comm_free != nullptr);
comm_free(comm_ctx);
}
for (auto & bc : backend_configs) {
ggml_backend_free(bc.backend);
}
@ -1845,20 +1871,15 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
if (n_backends > 1 && i < n_subgraphs - 1) {
bool backend_allreduce_success = false;
ggml_backend_allreduce_tensor_t allreduce_tensor = (ggml_backend_allreduce_tensor_t) ggml_backend_reg_get_proc_address(
ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_ctx->backend_configs[0].backend)), "ggml_backend_allreduce_tensor");
if (allreduce_tensor) {
std::vector<ggml_backend_t> backends;
backends.reserve(n_backends);
if (backend_ctx->comm_ctx) {
std::vector<ggml_tensor *> nodes;
nodes.reserve(n_backends);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
backends.push_back(bcj.backend);
ggml_cgraph * cgraph_ij = bcj.cgraphs[i].cgraph_main;
nodes.push_back(cgraph_ij->nodes[cgraph_ij->n_nodes-1]);
}
backend_allreduce_success = allreduce_tensor(backends.data(), nodes.data(), n_backends);
backend_allreduce_success = backend_ctx->comm_allreduce(backend_ctx->comm_ctx, nodes.data());
}
if (!backend_allreduce_success) {

File diff suppressed because it is too large


@ -109,6 +109,96 @@ static void simd_gemm(
C += N;
}
}
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
template <int RM>
static inline void rvv_simd_gemm_ukernel(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, size_t vl)
{
static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");
vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);
for (int kk = 0; kk < K; kk++) {
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);
acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
}
__riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
}
template <int RM>
static inline void rvv_simd_gemm_dispatch_tail(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, int KN, int remaining_rows)
{
if constexpr (RM > 0) {
if (remaining_rows == RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
}
} else {
rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
}
}
}
static constexpr int GEMM_RM = 7;
// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int K, int N)
{
const int KN = (int)__riscv_vlenb();
int64_t ii = 0;
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
}
A += GEMM_RM * K;
C += GEMM_RM * N;
}
int remaining_rows = M - ii;
rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
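
For readers new to the tiling above, the whole routine is equivalent to this scalar reference of C[M x N] += A[M x K] * B[K x N] (row-major, C accumulated in place); the microkernel simply vectorizes the innermost loop over KN-wide column slabs, RM rows at a time. Illustrative sketch only, not the ggml fallback path:

// scalar reference semantics of simd_gemm (illustrative)
static void gemm_ref(float * C, const float * A, const float * B, int M, int K, int N) {
    for (int m = 0; m < M; m++) {
        for (int k = 0; k < K; k++) {
            const float a = A[m*K + k]; // broadcast, like __riscv_vfmacc_vf_f32m4
            for (int n = 0; n < N; n++) {
                C[m*N + n] += a * B[k*N + n];
            }
        }
    }
}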


@ -931,6 +931,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_F16> {
static constexpr int qr = 1;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q1_0> {
static constexpr int qk = QK1_0;
static constexpr int qr = QR1_0;
static constexpr int qi = QI1_0;
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
static constexpr int qk = QK4_0;
@ -1099,10 +1106,6 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
#ifdef GGML_USE_NCCL
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
#endif // GGML_USE_NCCL
};
const ggml_cuda_device_info & ggml_cuda_info();


@ -711,6 +711,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
@ -767,6 +769,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
@ -822,6 +826,8 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
case GGML_TYPE_Q1_0:
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
case GGML_TYPE_Q4_1:
@ -843,6 +849,8 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float, nv_bfloat16>;
case GGML_TYPE_Q1_0:
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
case GGML_TYPE_Q4_1:
@ -864,6 +872,8 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F16:
return convert_unary_cuda<half, float>;
case GGML_TYPE_Q1_0:
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
case GGML_TYPE_Q4_1:


@ -1,5 +1,27 @@
#include "common.cuh"
static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q1_0 * x = (const block_q1_0 *) vx;
const float d = x[ib].d;
const int bit_index_0 = iqs;
const int bit_index_1 = iqs + 1;
const int byte_index_0 = bit_index_0 / 8;
const int bit_offset_0 = bit_index_0 % 8;
const int byte_index_1 = bit_index_1 / 8;
const int bit_offset_1 = bit_index_1 % 8;
// Extract bits: 1 = +d, 0 = -d (branchless)
const int bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
const int bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
v.x = (2*bit_0 - 1) * d;
v.y = (2*bit_1 - 1) * d;
}
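
A host-side sketch of the same unpacking, assuming the layout implied here (one block scale d plus a 1-bit-per-weight bitmap qs, bit set = +d, bit clear = -d); the function name and signature are illustrative, not part of ggml:

#include <cstdint>

// dequantize qk 1-bit weights from one Q1_0 block (illustrative reference)
static void dequantize_q1_0_ref(const uint8_t * qs, float d, float * y, int qk) {
    for (int i = 0; i < qk; i++) {
        const int bit = (qs[i/8] >> (i%8)) & 1;
        y[i] = (2*bit - 1) * d; // branchless: 1 -> +d, 0 -> -d
    }
}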
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
const block_q4_0 * x = (const block_q4_0 *) vx;


@ -179,6 +179,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q1_0:
get_rows_cuda_q<QK1_0, QR1_0, dequantize_q1_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);


@ -326,28 +326,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
if (getenv("GGML_CUDA_P2P") != nullptr) {
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
}
}
}
}
#ifdef GGML_USE_NCCL
int dev_ids[GGML_CUDA_MAX_DEVICES];
for (int id = 0; id < info.device_count; ++id) {
dev_ids[id] = id;
}
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
#endif // GGML_USE_NCCL
return info;
}
@ -1125,66 +1119,51 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
GGML_ASSERT(tensors[i] != nullptr);
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
}
struct ggml_backend_cuda_comm_context {
std::vector<ggml_backend_t> backends;
std::vector<ncclComm_t> comms;
const ggml_cuda_device_info info = ggml_cuda_info();
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
~ggml_backend_cuda_comm_context() {
for (ncclComm_t comm : comms) {
NCCL_CHECK(ncclCommDestroy(comm));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
};
#endif // GGML_USE_NCCL
// For large tensors it's faster to compress them to BF16 for the reduction:
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
#ifdef GGML_USE_NCCL
if (comm_ctx_v == nullptr) {
return;
}
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
delete comm_ctx;
#else
GGML_UNUSED(comm_ctx_v);
#endif // GGML_USE_NCCL
}
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
for (size_t i = 0; i < n_backends; ++i) {
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
#ifdef GGML_USE_NCCL
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
std::vector<int> dev_ids;
ret->backends.reserve(n_backends);
dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->backends.push_back(backends[i]);
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
tmp[i].pool = &cuda_ctx->pool();
tmp[i].alloc(ne);
ggml_cuda_set_device(i);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
dev_ids.push_back(cuda_ctx->device);
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
ggml_cuda_set_device(i);
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
return true;
ret->comms.resize(n_backends);
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
return ret;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
@ -1197,7 +1176,76 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, tensors, n_backends);
GGML_UNUSED_VARS(backends, n_backends);
return nullptr;
#endif // GGML_USE_NCCL
}
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
if (ne == 0) {
return true;
}
GGML_ASSERT(comm_ctx_v != nullptr);
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
const size_t n_backends = comm_ctx->backends.size();
for (size_t i = 0; i < n_backends; ++i) {
GGML_ASSERT(tensors[i] != nullptr);
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
}
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
// For large tensors it's faster to compress them to BF16 for the reduction:
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
tmp[i].pool = &cuda_ctx->pool();
tmp[i].alloc(ne);
ggml_cuda_set_device(cuda_ctx->device);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
ggml_cuda_set_device(cuda_ctx->device);
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
return true;
#else
GGML_UNUSED_VARS(comm_ctx_v, tensors);
return false;
#endif // GGML_USE_NCCL
}
@ -4804,6 +4852,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
switch (a->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -4841,6 +4890,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_F32:
case GGML_TYPE_BF16:
case GGML_TYPE_I32:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -5242,8 +5292,14 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
return (void *)ggml_backend_cuda_allreduce_tensor;
if (strcmp(name, "ggml_backend_comm_init") == 0) {
return (void *)ggml_backend_cuda_comm_init;
}
if (strcmp(name, "ggml_backend_comm_free") == 0) {
return (void *)ggml_backend_cuda_comm_free;
}
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
return (void *)ggml_backend_cuda_comm_allreduce_tensor;
}
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_cuda_split_buffer_type;


@ -5,6 +5,9 @@
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
switch (args.type_x) {
case GGML_TYPE_Q1_0:
mul_mat_q_case<GGML_TYPE_Q1_0>(ctx, args, stream);
break;
case GGML_TYPE_Q4_0:
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
break;
@ -272,6 +275,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
bool mmq_supported;
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:


@ -58,6 +58,8 @@ static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected b
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
switch (type_x) {
case GGML_TYPE_Q1_0:
return MMQ_Q8_1_DS_LAYOUT_D4;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return MMQ_Q8_1_DS_LAYOUT_DS4;
@ -186,6 +188,7 @@ static constexpr __device__ int get_mmq_y_device() {
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
switch (type) {
case GGML_TYPE_Q1_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
@ -230,6 +233,7 @@ static_assert(MMQ_MMA_TILE_X_K_NVFP4 % 8 == 4, "Wrong padding.");
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
@ -303,6 +307,87 @@ static constexpr __device__ int mmq_get_nwarps_device() {
// ------------------------------------------------------------
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q1_0(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K);
#else
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
constexpr int blocks_per_iter = MMQ_ITER_K / QK1_0;
constexpr int threads_per_row = blocks_per_iter * QI1_0;
constexpr int nrows = warp_size / threads_per_row;
constexpr int scale_entries_per_block = QK1_0 / QK8_1;
constexpr int scale_entries_per_row = blocks_per_iter * scale_entries_per_block;
const int txi = threadIdx.x % threads_per_row;
const int kbx = txi / QI1_0;
const int kqsx = txi % QI1_0;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;
if (need_check) {
i = min(i, i_max);
}
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + kbx;
const int qs_offset = 4*kqsx;
const int qs0 = bxi->qs[qs_offset + 0] | (bxi->qs[qs_offset + 1] << 8) |
(bxi->qs[qs_offset + 2] << 16) | (bxi->qs[qs_offset + 3] << 24);
int unpacked_bytes[8];
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int shift = j * 4;
const int bits4 = (qs0 >> shift) & 0x0F;
const int b0 = (bits4 & 0x01) ? 1 : -1;
const int b1 = (bits4 & 0x02) ? 1 : -1;
const int b2 = (bits4 & 0x04) ? 1 : -1;
const int b3 = (bits4 & 0x08) ? 1 : -1;
unpacked_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
}
const int dst_offset = kbx*(scale_entries_per_block*QI8_0) + kqsx*QI8_0;
#pragma unroll
for (int j = 0; j < 8; ++j) {
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + dst_offset + j] = unpacked_bytes[j];
#else
x_qs[i*(2*MMQ_TILE_NE_K + 1) + dst_offset + j] = unpacked_bytes[j];
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
}
const int ksx = threadIdx.x % scale_entries_per_row;
const int scale_block = ksx / scale_entries_per_block;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + threadIdx.y;
if (need_check) {
i = min(i, i_max);
}
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + scale_block;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + ksx] = bxi->d;
#else
x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + ksx] = bxi->d;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
}
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
@ -3291,6 +3376,14 @@ static __device__ __forceinline__ void mmq_write_back_mma(
template <int mmq_x, int mmq_y, bool need_check, ggml_type type>
struct mmq_type_traits;
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q1_0> {
static constexpr int vdr = VDR_Q1_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q1_0<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_0> {
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;


@ -9,6 +9,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0: return vec_dot_q1_0_q8_1;
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
@ -36,6 +37,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0: return VDR_Q1_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
@ -886,6 +888,12 @@ static void mul_mat_vec_q_switch_type(
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ids_stride, cudaStream_t stream) {
switch (type_x) {
case GGML_TYPE_Q1_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q1_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q4_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,


@ -32,6 +32,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
TYPES_MMQ = [
"GGML_TYPE_Q1_0",
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",


@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmq.cuh"
DECL_MMQ_CASE(GGML_TYPE_Q1_0);


@ -106,6 +106,9 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
#define VDR_Q1_0_Q8_1_MMVQ 1 // Process one 32-element chunk at a time for parallelism
#define VDR_Q1_0_Q8_1_MMQ 4 // Q1_0 has 128 bits (4 ints) per block
#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ 4
@ -669,6 +672,51 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
return d6 * sumf_d;
}
static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq + kbx;
// Q1_0: 128 elements with ONE scale
// Q8_1: 32 elements per block with individual scales
// iqs selects which of the 4 chunks of 32 elements to process (0-3)
const float d1 = bq1_0->d;
// Process only the chunk specified by iqs
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
// Load 32 bits (4 bytes) for this chunk from Q1_0
const int offset = iqs * 4;
const int v = bq1_0->qs[offset + 0] | (bq1_0->qs[offset + 1] << 8) |
(bq1_0->qs[offset + 2] << 16) | (bq1_0->qs[offset + 3] << 24);
// Unpack 32 bits into 32 signed values (-1 or +1)
int vi_bytes[8];
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int shift = j * 4;
const int bits4 = (v >> shift) & 0x0F;
const int b0 = (bits4 & 0x01) ? 1 : -1;
const int b1 = (bits4 & 0x02) ? 1 : -1;
const int b2 = (bits4 & 0x04) ? 1 : -1;
const int b3 = (bits4 & 0x08) ? 1 : -1;
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
}
// Compute dot product for this 32-element chunk
int sumi = 0;
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int u = get_int_b4(bq8_1_chunk->qs, j);
sumi = ggml_cuda_dp4a(vi_bytes[j], u, sumi);
}
// Apply Q1_0's single scale and this chunk's Q8_1 scale
const float d8 = __low2float(bq8_1_chunk->ds);
return d1 * d8 * sumi;
}
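
The per-chunk arithmetic, written out as a scalar sketch (q1 bits and q8 values taken as plain arrays; for clarity only, not a drop-in replacement for the dp4a path):

// one 32-element chunk of the Q1_0 x Q8_1 dot product:
// sign (+1/-1 from the bitmap) times int8, scaled by d1 (block) and d8 (chunk)
static float vec_dot_q1_0_q8_1_ref(const uint8_t * q1_bits, const int8_t * q8, float d1, float d8) {
    int sumi = 0;
    for (int i = 0; i < 32; i++) {
        const int s = ((q1_bits[i/8] >> (i%8)) & 1) ? 1 : -1;
        sumi += s * q8[i];
    }
    return d1 * d8 * sumi;
}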
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {


@ -1406,7 +1406,7 @@ struct vk_op_im2col_push_constants {
uint32_t IW; uint32_t IH;
uint32_t OW; uint32_t OH;
uint32_t KW; uint32_t KH;
uint32_t pelements;
uint32_t OH_batch;
uint32_t CHW;
int32_t s0; int32_t s1;
int32_t p0; int32_t p1;
@ -10098,7 +10098,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
elements = { OW * KW * KH, OH, batch * IC };
const uint32_t CHW = IC * KH * KW;
// Cap X workgroups to limit concurrent IC channel reads.
// The shader loops over X to cover the full CHW dimension.
// AMD prefers a lower limit
const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u;
const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
elements = { x_elements, OW, OH * batch };
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
} break;
@ -11761,7 +11767,6 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
const uint32_t pelements = OW * KW * KH;
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
@ -11773,7 +11778,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
dst_addr,
batch_offset, offset_delta,
IC, IW, IH, OW, OH, KW, KH,
pelements,
OH * batch,
IC * KH * KW,
s0, s1, p0, p1, d0, d1, batch * IC
});


@ -13,7 +13,7 @@ layout (push_constant) uniform parameter
uint IW; uint IH;
uint OW; uint OH;
uint KW; uint KH;
uint pelements;
uint OH_batch;
uint CHW;
int s0; int s1;
int p0; int p1;
@ -34,82 +34,60 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
#endif
void im2col(const uint y, const uint z) {
const uint gidx = gl_GlobalInvocationID.x;
void im2col(const uint ow, const uint z_idx) {
const uint oh = z_idx % p.OH;
const uint batch_idx = z_idx / p.OH;
const uint oh = y;
const uint batch = z / p.IC;
const uint ic = z % p.IC;
const uint gidx = gl_LocalInvocationID.x;
const uint src_batch = batch_idx * p.batch_offset;
const BDA_OFFSET_T dst_row = ((BDA_OFFSET_T(batch_idx) * p.OH + oh) * p.OW + ow) * p.CHW;
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
const int oh_s1 = int(oh) * p.s1;
const uint ksize = p.OW * p.KH;
const uint KHKW = p.KH * p.KW;
const uint base_linear_idx = gidx * NUM_ITER;
uint wg_x = gl_WorkGroupID.x;
do {
const uint wg_offset = wg_x * 512;
uint current_kx = base_linear_idx / ksize;
const uint rem = base_linear_idx - (current_kx * ksize);
uint current_ky = rem / p.OW;
uint current_ix = rem % p.OW;
[[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE;
A_TYPE values[NUM_ITER];
BDA_OFFSET_T offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
values[idx] = A_TYPE(0);
}
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint linear_idx = base_linear_idx + idx;
if (linear_idx >= p.pelements) {
continue;
}
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;
offset_dst[idx] = dst_base + BDA_OFFSET_T(current_ix) * p.CHW + current_ky * p.KW + current_kx;
if ((iih < p.IH) && (iiw < p.IW)) {
values[idx] = data_a[src_base + iih * p.IW + iiw];
}
if (++current_ix == p.OW) {
current_ix = 0;
if (++current_ky == p.KH) {
current_ky = 0;
current_kx++;
if (chw_idx >= p.CHW) {
return;
}
}
}
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint ic = chw_idx / KHKW;
const uint rem = chw_idx - ic * KHKW;
const uint ky = rem / p.KW;
const uint kx = rem - ky * p.KW;
const uint linear_idx = base_linear_idx + idx;
const uint iiw = ow * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
if (linear_idx >= p.pelements) {
continue;
}
A_TYPE val = A_TYPE(0);
if (iih < p.IH && iiw < p.IW) {
val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw];
}
#if BDA
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst[idx]);
dst_addr.d = D_TYPE(values[idx]);
D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx));
out_ptr.d = D_TYPE(val);
#else
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
data_d[dst_row + chw_idx] = D_TYPE(val);
#endif
}
}
wg_x += gl_NumWorkGroups.x;
} while (wg_x * 512 < p.CHW);
}
void main() {
uint y = gl_GlobalInvocationID.y;
while (y < p.OH) {
uint ow = gl_GlobalInvocationID.y;
while (ow < p.OW) {
uint z = gl_GlobalInvocationID.z;
while (z < p.batch_IC) {
im2col(y, z);
while (z < p.OH_batch) {
im2col(ow, z);
z += gl_NumWorkGroups.z;
}
y += gl_NumWorkGroups.y;
ow += gl_NumWorkGroups.y;
}
}
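
For reference, the indexing the rewritten shader performs corresponds to this scalar im2col; a sketch with illustrative names (offset_delta and batch_offset are the float-element strides the host code passes in the push constants), not the ggml CPU kernel:

// dst layout: [batch][OH][OW][IC*KH*KW]; out-of-bounds taps write 0
static void im2col_ref(const float * src, float * dst,
                       int batch, int IC, int IH, int IW, int OH, int OW, int KH, int KW,
                       int s0, int s1, int p0, int p1, int d0, int d1,
                       long offset_delta, long batch_offset) {
    const int CHW = IC*KH*KW;
    for (int b = 0; b < batch; b++)
    for (int oh = 0; oh < OH; oh++)
    for (int ow = 0; ow < OW; ow++) {
        const long dst_row = ((long(b)*OH + oh)*OW + ow)*(long)CHW;
        for (int chw = 0; chw < CHW; chw++) {
            const int ic = chw / (KH*KW), rem = chw % (KH*KW);
            const int ky = rem / KW,      kx = rem % KW;
            const int iih = oh*s1 + ky*d1 - p1;
            const int iiw = ow*s0 + kx*d0 - p0;
            const bool inb = iih >= 0 && iih < IH && iiw >= 0 && iiw < IW;
            dst[dst_row + chw] = inb ? src[b*batch_offset + ic*offset_delta + (long)iih*IW + iiw] : 0.0f;
        }
    }
}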


@ -109,7 +109,7 @@ mtmd_context_params mtmd_context_params_default() {
/* use_gpu */ true,
/* print_timings */ true,
/* n_threads */ 4,
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
/* image_marker */ nullptr,
/* media_marker */ mtmd_default_marker(),
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* warmup */ true,
@ -169,7 +169,7 @@ struct mtmd_context {
media_marker (ctx_params.media_marker),
n_embd_text (llama_model_n_embd_inp(text_model))
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
if (ctx_params.image_marker != nullptr) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
}
@ -584,9 +584,6 @@ struct mtmd_tokenizer {
parse_special = text->parse_special;
input_text = text->text;
vocab = llama_model_get_vocab(ctx->text_model);
// for compatibility, we convert image marker to media marker
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
}
int32_t tokenize(mtmd_input_chunks * output) {


@ -46,9 +46,6 @@
# define MTMD_API
#endif
// deprecated marker, use mtmd_default_marker() instead
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
#ifdef __cplusplus
extern "C" {
#endif


@ -84,6 +84,14 @@ std::string gen_tool_call_id() {
return random_string();
}
static std::string media_marker = "";
const char * get_media_marker() {
if (media_marker.empty()) {
media_marker = "<__media_" + random_string() + "__>";
}
return media_marker.c_str();
}
//
// lora utils
//
@ -975,7 +983,7 @@ json oaicompat_chat_params_parse(
handle_media(out_files, image_url, opt.media_path);
p["type"] = "media_marker";
p["text"] = mtmd_default_marker();
p["text"] = get_media_marker();
p.erase("image_url");
} else if (type == "input_audio") {
@ -996,7 +1004,7 @@ json oaicompat_chat_params_parse(
// TODO: add audio_url support by reusing handle_media()
p["type"] = "media_marker";
p["text"] = mtmd_default_marker();
p["text"] = get_media_marker();
p.erase("input_audio");
} else if (type != "text") {
@ -1460,7 +1468,7 @@ json convert_transcriptions_to_chatcmpl(
if (!language.empty()) {
prompt += string_format(" (language: %s)", language.c_str());
}
prompt += mtmd_default_marker();
prompt += get_media_marker();
json chatcmpl_body = inp_body; // copy all fields
chatcmpl_body["messages"] = json::array({


@ -92,6 +92,9 @@ std::string random_string();
std::string gen_chatcmplid();
std::string gen_tool_call_id();
// get a random marker; note: each time the server restarts, the marker will be different
const char * get_media_marker();
//
// lora utils
//


@ -708,6 +708,7 @@ private:
mparams.warmup = params_base.warmup;
mparams.image_min_tokens = params_base.image_min_tokens;
mparams.image_max_tokens = params_base.image_max_tokens;
mparams.media_marker = get_media_marker();
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
if (mctx == nullptr) {