mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 03:30:20 +00:00
Merge commit '1e796eb41f' into concedo_experimental
# Conflicts: # .devops/nix/package.nix # .github/workflows/build-riscv.yml # .github/workflows/build-vulkan.yml # .github/workflows/build.yml # docs/backend/SYCL.md # docs/build.md # docs/development/HOWTO-add-model.md # embd_res/templates/Reka-Edge.jinja # ggml/CMakeLists.txt # ggml/src/ggml-rpc/CMakeLists.txt # ggml/src/ggml-rpc/ggml-rpc.cpp # ggml/src/ggml-sycl/CMakeLists.txt # ggml/src/ggml-sycl/convert.cpp # ggml/src/ggml-sycl/dequantize.hpp # ggml/src/ggml-sycl/dmmv.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl # tests/test-chat.cpp # tools/rpc/README.md
This commit is contained in:
commit
768527b031
29 changed files with 1544 additions and 248 deletions
|
|
@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
|||
args_field = format.function_field + "." + args_field;
|
||||
}
|
||||
|
||||
auto tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
auto tools_parser = p.eps();
|
||||
if (format.section_start.empty() && !format.per_call_start.empty()) {
|
||||
auto single_tool_parser = p.standard_json_tools(
|
||||
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
|
||||
} else {
|
||||
tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
}
|
||||
|
||||
// Handle content wrappers if present
|
||||
if (ctx.content && ctx.content->is_always_wrapped()) {
|
||||
|
|
|
|||
|
|
@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
|
|||
|
||||
private:
|
||||
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning);
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze format based on position of function and argument name in needle
|
||||
void analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning);
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze specifics of JSON native format (entire tool call is a JSON object)
|
||||
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle);
|
||||
|
||||
// Check if parallel calls in JSON native format array wrapped or tag wrapped
|
||||
void analyze_json_native_parallel_calls();
|
||||
|
||||
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
|
||||
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle);
|
||||
|
|
|
|||
|
|
@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
|||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
|
||||
|
||||
analyze_tool_calls(reasoning);
|
||||
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
|
||||
|
||||
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
|
||||
if (caps.supports_parallel_tool_calls) {
|
||||
|
|
@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
|||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
|
||||
json assistant_no_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
|
|
@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
|||
return;
|
||||
}
|
||||
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning) {
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls) {
|
||||
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
|||
|
||||
if (format.mode == tool_format::JSON_NATIVE) {
|
||||
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
|
||||
if (supports_parallel_tool_calls) {
|
||||
analyze_json_native_parallel_calls();
|
||||
}
|
||||
} else {
|
||||
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
|
||||
}
|
||||
|
|
@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
|||
format.per_call_end = trim_whitespace(format.per_call_end);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_json_native_parallel_calls() {
|
||||
json assistant_one_tool = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call }) }
|
||||
};
|
||||
|
||||
json assistant_two_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg, assistant_one_tool });
|
||||
params.tools = tools;
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::string & second_call = comparison->diff.right;
|
||||
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
|
||||
format.per_call_start = format.section_start;
|
||||
format.per_call_end = format.section_end;
|
||||
format.section_start.clear();
|
||||
format.section_end.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle) {
|
||||
|
|
|
|||
|
|
@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
|||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
|
|
@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
|||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
|
|
|
|||
|
|
@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
int n_input = input_tokens.size();
|
||||
|
||||
if (n_input >= params.n_ctx) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
|
||||
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -202,8 +202,11 @@ extern "C" {
|
|||
|
||||
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||
|
||||
// AllReduce operation for tensor parallelism (meta backend)
|
||||
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
|
||||
// Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
|
||||
typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
|
||||
typedef void (*ggml_backend_comm_free_t)(void * comm_ctx);
|
||||
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
|
||||
|
||||
// Split buffer type for tensor parallelism (old)
|
||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
||||
// Set the number of threads for the backend
|
||||
|
|
|
|||
|
|
@ -6,9 +6,9 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 3
|
||||
#define RPC_PROTO_MINOR_VERSION 6
|
||||
#define RPC_PROTO_PATCH_VERSION 1
|
||||
#define RPC_PROTO_MAJOR_VERSION 4
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
|
||||
#ifdef __cplusplus
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||
|
|
|
|||
|
|
@ -1791,8 +1791,32 @@ extern "C" {
|
|||
int n_dims,
|
||||
int mode);
|
||||
|
||||
// custom RoPE
|
||||
// RoPE operations with extended options
|
||||
// a is the input tensor to apply RoPE to, shape [n_embd, n_head, n_token]
|
||||
// b is an int32 vector with size n_token
|
||||
// c is freq factors (e.g. phi3-128k), (optional)
|
||||
// mode can be GGML_ROPE_TYPE_NORMAL or NEOX; for MROPE and VISION mode, use ggml_rope_multi
|
||||
//
|
||||
// pseudo-code for computing theta:
|
||||
// for i in [0, n_dims/2):
|
||||
// theta[i] = b[i] * powf(freq_base, -2.0 * i / n_dims);
|
||||
// theta[i] = theta[i] / c[i]; # if c is provided, divide theta by c
|
||||
// theta[i] = rope_yarn(theta[i], ...); # note: theta = theta * freq_scale is applied here
|
||||
//
|
||||
// other params are used by YaRN RoPE scaling, these default values will disable YaRN:
|
||||
// freq_scale = 1.0f
|
||||
// ext_factor = 0.0f
|
||||
// attn_factor = 1.0f
|
||||
// beta_fast = 0.0f
|
||||
// beta_slow = 0.0f
|
||||
//
|
||||
// example:
|
||||
// (marking: c = cos, s = sin, 0 = unrotated)
|
||||
// given a single head with size = 8 --> [00000000]
|
||||
// GGML_ROPE_TYPE_NORMAL n_dims = 4 --> [cscs0000]
|
||||
// GGML_ROPE_TYPE_NORMAL n_dims = 8 --> [cscscscs]
|
||||
// GGML_ROPE_TYPE_NEOX n_dims = 4 --> [ccss0000]
|
||||
// GGML_ROPE_TYPE_NEOX n_dims = 8 --> [ccccssss]
|
||||
GGML_API struct ggml_tensor * ggml_rope_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
|
@ -1808,6 +1832,36 @@ extern "C" {
|
|||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
// multi-dimensional RoPE, for Qwen-VL and similar vision models
|
||||
// mode can be either VISION, MROPE, IMROPE, cannot be combined with NORMAL or NEOX
|
||||
// sections specify how many dimensions to rotate in each section:
|
||||
// section length is equivalent to number of cos/sin pairs, NOT the number of dims
|
||||
// (i.e. sum of 4 sections are expected to be n_dims/2)
|
||||
// last sections can be 0, means ignored
|
||||
// all other options are identical to ggml_rope_ext
|
||||
//
|
||||
// important note:
|
||||
// - NEOX ordering is automatically applied and cannot be disabled for MROPE and VISION
|
||||
// if you need normal ordering, there are 2 methods:
|
||||
// (1) split the tensor manually using ggml_view
|
||||
// (2) permute the weight upon conversion
|
||||
// - for VISION, n_dims must be head_size/2
|
||||
//
|
||||
// example M-RoPE:
|
||||
// given sections = [t=4, y=2, x=2, 0]
|
||||
// given a single head with size = 18 --> [000000000000000000]
|
||||
// GGML_ROPE_TYPE_MROPE n_dims = 16 --> [ttttyyxxttttyyxx00] (cos/sin are applied in NEOX ordering)
|
||||
// GGML_ROPE_TYPE_IMROPE n_dims = 16 --> [ttyxttyxttyxttyx00] (interleaved M-RoPE, still NEOX ordering)
|
||||
// note: the theta for each dim is computed the same way as ggml_rope_ext, no matter the section
|
||||
// in other words, idx used for theta: [0123456789... until n_dims/2], not reset for each section
|
||||
//
|
||||
// example vision RoPE:
|
||||
// given sections = [y=4, x=4, 0, 0] (last 2 sections are ignored)
|
||||
// given a single head with size = 8 --> [00000000]
|
||||
// GGML_ROPE_TYPE_VISION n_dims = 4 --> [yyyyxxxx]
|
||||
// other values of n_dims are untested and is undefined behavior
|
||||
// note: unlike MROPE, the theta for each dim is computed differently for each section
|
||||
// in other words, idx used for theta: [0123] for y section, then [0123] for x section
|
||||
GGML_API struct ggml_tensor * ggml_rope_multi(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
|
|
|||
|
|
@ -1419,22 +1419,48 @@ struct ggml_backend_meta_context {
|
|||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
|
||||
void * comm_ctx = nullptr;
|
||||
ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;
|
||||
|
||||
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
|
||||
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
|
||||
name = "Meta(";
|
||||
std::vector<ggml_backend_t> simple_backends;
|
||||
backend_configs.reserve(n_devs);
|
||||
simple_backends.reserve(n_devs);
|
||||
for (size_t i = 0; i < n_devs; i++) {
|
||||
ggml_backend_dev_t simple_dev = ggml_backend_meta_dev_simple_dev(meta_dev, i);
|
||||
if (i > 0) {
|
||||
name += ",";
|
||||
}
|
||||
name += ggml_backend_dev_name(simple_dev);
|
||||
backend_configs.emplace_back(ggml_backend_dev_init(simple_dev, params));
|
||||
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
|
||||
backend_configs.emplace_back(simple_backends.back());
|
||||
}
|
||||
name += ")";
|
||||
|
||||
if (n_devs > 1) {
|
||||
ggml_backend_comm_init_t comm_init = (ggml_backend_comm_init_t) ggml_backend_reg_get_proc_address(
|
||||
ggml_backend_dev_backend_reg(ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_init");
|
||||
if (comm_init != nullptr) {
|
||||
comm_ctx = comm_init(simple_backends.data(), simple_backends.size());
|
||||
}
|
||||
}
|
||||
if (comm_ctx != nullptr) {
|
||||
comm_allreduce = (ggml_backend_comm_allreduce_tensor_t)
|
||||
ggml_backend_reg_get_proc_address(ggml_backend_dev_backend_reg(
|
||||
ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_allreduce_tensor");
|
||||
GGML_ASSERT(comm_allreduce != nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
~ggml_backend_meta_context() {
|
||||
if (comm_ctx != nullptr) {
|
||||
ggml_backend_comm_free_t comm_free = (ggml_backend_comm_free_t) ggml_backend_reg_get_proc_address(
|
||||
ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_configs[0].backend)), "ggml_backend_comm_free");
|
||||
GGML_ASSERT(comm_free != nullptr);
|
||||
comm_free(comm_ctx);
|
||||
}
|
||||
for (auto & bc : backend_configs) {
|
||||
ggml_backend_free(bc.backend);
|
||||
}
|
||||
|
|
@ -1845,20 +1871,15 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
|
||||
if (n_backends > 1 && i < n_subgraphs - 1) {
|
||||
bool backend_allreduce_success = false;
|
||||
ggml_backend_allreduce_tensor_t allreduce_tensor = (ggml_backend_allreduce_tensor_t) ggml_backend_reg_get_proc_address(
|
||||
ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_ctx->backend_configs[0].backend)), "ggml_backend_allreduce_tensor");
|
||||
if (allreduce_tensor) {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
backends.reserve(n_backends);
|
||||
if (backend_ctx->comm_ctx) {
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
nodes.reserve(n_backends);
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
backends.push_back(bcj.backend);
|
||||
ggml_cgraph * cgraph_ij = bcj.cgraphs[i].cgraph_main;
|
||||
nodes.push_back(cgraph_ij->nodes[cgraph_ij->n_nodes-1]);
|
||||
}
|
||||
backend_allreduce_success = allreduce_tensor(backends.data(), nodes.data(), n_backends);
|
||||
backend_allreduce_success = backend_ctx->comm_allreduce(backend_ctx->comm_ctx, nodes.data());
|
||||
}
|
||||
|
||||
if (!backend_allreduce_success) {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -109,6 +109,96 @@ static void simd_gemm(
|
|||
C += N;
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
|
||||
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
|
||||
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
|
||||
template <int RM>
|
||||
static inline void rvv_simd_gemm_ukernel(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int K, int N, size_t vl)
|
||||
{
|
||||
static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");
|
||||
|
||||
vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
|
||||
vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
|
||||
if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
|
||||
if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
|
||||
if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
|
||||
if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
|
||||
if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
|
||||
if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);
|
||||
|
||||
for (int kk = 0; kk < K; kk++) {
|
||||
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);
|
||||
|
||||
acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
|
||||
if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
|
||||
}
|
||||
|
||||
__riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
|
||||
if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
|
||||
if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
|
||||
if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
|
||||
if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
|
||||
if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
|
||||
if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
|
||||
}
|
||||
|
||||
template <int RM>
|
||||
static inline void rvv_simd_gemm_dispatch_tail(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int K, int N, int KN, int remaining_rows)
|
||||
{
|
||||
if constexpr (RM > 0) {
|
||||
if (remaining_rows == RM) {
|
||||
int64_t jj = 0;
|
||||
for (; jj + KN <= N; jj += KN) {
|
||||
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
|
||||
}
|
||||
if (jj < N) {
|
||||
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
|
||||
}
|
||||
} else {
|
||||
rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr int GEMM_RM = 7;
|
||||
|
||||
// C[M x N] += A[M x K] * B[K x N]
|
||||
static void simd_gemm(
|
||||
float * GGML_RESTRICT C,
|
||||
const float * GGML_RESTRICT A,
|
||||
const float * GGML_RESTRICT B,
|
||||
int M, int K, int N)
|
||||
{
|
||||
const int KN = (int)__riscv_vlenb();
|
||||
int64_t ii = 0;
|
||||
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
|
||||
int64_t jj = 0;
|
||||
for (; jj + KN <= N; jj += KN) {
|
||||
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
|
||||
}
|
||||
if (jj < N) {
|
||||
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
|
||||
}
|
||||
A += GEMM_RM * K;
|
||||
C += GEMM_RM * N;
|
||||
}
|
||||
|
||||
int remaining_rows = M - ii;
|
||||
rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
|
||||
}
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
|
|
|
|||
|
|
@ -931,6 +931,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_F16> {
|
|||
static constexpr int qr = 1;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q1_0> {
|
||||
static constexpr int qk = QK1_0;
|
||||
static constexpr int qr = QR1_0;
|
||||
static constexpr int qi = QI1_0;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
|
||||
static constexpr int qk = QK4_0;
|
||||
|
|
@ -1099,10 +1106,6 @@ struct ggml_cuda_device_info {
|
|||
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
|
||||
#endif // GGML_USE_NCCL
|
||||
};
|
||||
|
||||
const ggml_cuda_device_info & ggml_cuda_info();
|
||||
|
|
|
|||
|
|
@ -711,6 +711,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
|||
|
||||
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
|
@ -767,6 +769,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
|
@ -822,6 +826,8 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
|||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
|
@ -843,6 +849,8 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
|||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float, nv_bfloat16>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
|
@ -864,6 +872,8 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
|||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, float>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,27 @@
|
|||
#include "common.cuh"
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q1_0 * x = (const block_q1_0 *) vx;
|
||||
|
||||
const float d = x[ib].d;
|
||||
|
||||
const int bit_index_0 = iqs;
|
||||
const int bit_index_1 = iqs + 1;
|
||||
|
||||
const int byte_index_0 = bit_index_0 / 8;
|
||||
const int bit_offset_0 = bit_index_0 % 8;
|
||||
|
||||
const int byte_index_1 = bit_index_1 / 8;
|
||||
const int bit_offset_1 = bit_index_1 % 8;
|
||||
|
||||
// Extract bits: 1 = +d, 0 = -d (branchless)
|
||||
const int bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
|
||||
const int bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;
|
||||
|
||||
v.x = (2*bit_0 - 1) * d;
|
||||
v.y = (2*bit_1 - 1) * d;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
|
|
|
|||
|
|
@ -179,6 +179,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
|
|||
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
get_rows_cuda_q<QK1_0, QR1_0, dequantize_q1_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
|
|
|
|||
|
|
@ -326,28 +326,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||
// configure logging to stdout
|
||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
for (int id_other = 0; id_other < info.device_count; ++id_other) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
int can_access_peer;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
if (can_access_peer) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
if (getenv("GGML_CUDA_P2P") != nullptr) {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
for (int id_other = 0; id_other < info.device_count; ++id_other) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
int can_access_peer;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
if (can_access_peer) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
int dev_ids[GGML_CUDA_MAX_DEVICES];
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
dev_ids[id] = id;
|
||||
}
|
||||
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
|
@ -1125,66 +1119,51 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
|||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
GGML_ASSERT(tensors[i] != nullptr);
|
||||
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
|
||||
}
|
||||
struct ggml_backend_cuda_comm_context {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
std::vector<ncclComm_t> comms;
|
||||
|
||||
const ggml_cuda_device_info info = ggml_cuda_info();
|
||||
|
||||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
|
||||
~ggml_backend_cuda_comm_context() {
|
||||
for (ncclComm_t comm : comms) {
|
||||
NCCL_CHECK(ncclCommDestroy(comm));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// For large tensors it's faster to compress them to BF16 for the reduction:
|
||||
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
|
||||
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
delete comm_ctx;
|
||||
#else
|
||||
GGML_UNUSED(comm_ctx_v);
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
|
||||
std::vector<int> dev_ids;
|
||||
ret->backends.reserve(n_backends);
|
||||
dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->backends.push_back(backends[i]);
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
tmp[i].pool = &cuda_ctx->pool();
|
||||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(i);
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
dev_ids.push_back(cuda_ctx->device);
|
||||
}
|
||||
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
|
||||
ggml_cuda_set_device(i);
|
||||
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
return true;
|
||||
ret->comms.resize(n_backends);
|
||||
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
|
||||
return ret;
|
||||
#else
|
||||
// If NCCL is installed it is used by default for optimal performance.
|
||||
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
|
||||
|
|
@ -1197,7 +1176,76 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
|
|||
warning_printed = true;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
GGML_UNUSED_VARS(backends, tensors, n_backends);
|
||||
GGML_UNUSED_VARS(backends, n_backends);
|
||||
return nullptr;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(comm_ctx_v != nullptr);
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
GGML_ASSERT(tensors[i] != nullptr);
|
||||
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
|
||||
}
|
||||
|
||||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// For large tensors it's faster to compress them to BF16 for the reduction:
|
||||
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
|
||||
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
tmp[i].pool = &cuda_ctx->pool();
|
||||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
GGML_UNUSED_VARS(comm_ctx_v, tensors);
|
||||
return false;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
|
@ -4804,6 +4852,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
switch (a->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
@ -4841,6 +4890,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
@ -5242,8 +5292,14 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
|
|||
|
||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_cuda_allreduce_tensor;
|
||||
if (strcmp(name, "ggml_backend_comm_init") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_free") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_allreduce_tensor;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_cuda_split_buffer_type;
|
||||
|
|
|
|||
|
|
@ -5,6 +5,9 @@
|
|||
|
||||
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
switch (args.type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q1_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||
break;
|
||||
|
|
@ -272,6 +275,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
|||
bool mmq_supported;
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
|
|||
|
|
@ -58,6 +58,8 @@ static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected b
|
|||
|
||||
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return MMQ_Q8_1_DS_LAYOUT_DS4;
|
||||
|
|
@ -186,6 +188,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
|||
|
||||
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
|
|
@ -230,6 +233,7 @@ static_assert(MMQ_MMA_TILE_X_K_NVFP4 % 8 == 4, "Wrong padding.");
|
|||
|
||||
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
|
|
@ -303,6 +307,87 @@ static constexpr __device__ int mmq_get_nwarps_device() {
|
|||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q1_0(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / QK1_0;
|
||||
constexpr int threads_per_row = blocks_per_iter * QI1_0;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
constexpr int scale_entries_per_block = QK1_0 / QK8_1;
|
||||
constexpr int scale_entries_per_row = blocks_per_iter * scale_entries_per_block;
|
||||
|
||||
const int txi = threadIdx.x % threads_per_row;
|
||||
const int kbx = txi / QI1_0;
|
||||
const int kqsx = txi % QI1_0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
|
||||
int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + kbx;
|
||||
const int qs_offset = 4*kqsx;
|
||||
const int qs0 = bxi->qs[qs_offset + 0] | (bxi->qs[qs_offset + 1] << 8) |
|
||||
(bxi->qs[qs_offset + 2] << 16) | (bxi->qs[qs_offset + 3] << 24);
|
||||
|
||||
int unpacked_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (qs0 >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
unpacked_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
const int dst_offset = kbx*(scale_entries_per_block*QI8_0) + kqsx*QI8_0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + dst_offset + j] = unpacked_bytes[j];
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + dst_offset + j] = unpacked_bytes[j];
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
const int ksx = threadIdx.x % scale_entries_per_row;
|
||||
const int scale_block = ksx / scale_entries_per_block;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
int i = i0 + threadIdx.y;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + scale_block;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + ksx] = bxi->d;
|
||||
#else
|
||||
x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + ksx] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
|
@ -3291,6 +3376,14 @@ static __device__ __forceinline__ void mmq_write_back_mma(
|
|||
template <int mmq_x, int mmq_y, bool need_check, ggml_type type>
|
||||
struct mmq_type_traits;
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q1_0> {
|
||||
static constexpr int vdr = VDR_Q1_0_Q8_1_MMQ;
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q1_0<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_0> {
|
||||
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
|
|||
|
||||
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return vec_dot_q1_0_q8_1;
|
||||
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
|
||||
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
|
||||
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
|
||||
|
|
@ -36,6 +37,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
|||
|
||||
static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return VDR_Q1_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
|
||||
|
|
@ -886,6 +888,12 @@ static void mul_mat_vec_q_switch_type(
|
|||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const int ids_stride, cudaStream_t stream) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q1_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
|
|||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q1_0",
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq.cuh"
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_Q1_0);
|
||||
|
|
@ -106,6 +106,9 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
|
|||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||
|
||||
#define VDR_Q1_0_Q8_1_MMVQ 1 // Process one 32-element chunk at a time for parallelism
|
||||
#define VDR_Q1_0_Q8_1_MMQ 4 // Q1_0 has 128 bits (4 ints) per block
|
||||
|
||||
#define VDR_Q4_0_Q8_1_MMVQ 2
|
||||
#define VDR_Q4_0_Q8_1_MMQ 4
|
||||
|
||||
|
|
@ -669,6 +672,51 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|||
return d6 * sumf_d;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq + kbx;
|
||||
|
||||
// Q1_0: 128 elements with ONE scale
|
||||
// Q8_1: 32 elements per block with individual scales
|
||||
// iqs selects which of the 4 chunks of 32 elements to process (0-3)
|
||||
|
||||
const float d1 = bq1_0->d;
|
||||
|
||||
// Process only the chunk specified by iqs
|
||||
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
|
||||
|
||||
// Load 32 bits (4 bytes) for this chunk from Q1_0
|
||||
const int offset = iqs * 4;
|
||||
const int v = bq1_0->qs[offset + 0] | (bq1_0->qs[offset + 1] << 8) |
|
||||
(bq1_0->qs[offset + 2] << 16) | (bq1_0->qs[offset + 3] << 24);
|
||||
|
||||
// Unpack 32 bits into 32 signed values (-1 or +1)
|
||||
int vi_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (v >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
// Compute dot product for this 32-element chunk
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int u = get_int_b4(bq8_1_chunk->qs, j);
|
||||
sumi = ggml_cuda_dp4a(vi_bytes[j], u, sumi);
|
||||
}
|
||||
|
||||
// Apply Q1_0's single scale and this chunk's Q8_1 scale
|
||||
const float d8 = __low2float(bq8_1_chunk->ds);
|
||||
return d1 * d8 * sumi;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
|
|
|
|||
|
|
@ -1406,7 +1406,7 @@ struct vk_op_im2col_push_constants {
|
|||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
uint32_t KW; uint32_t KH;
|
||||
uint32_t pelements;
|
||||
uint32_t OH_batch;
|
||||
uint32_t CHW;
|
||||
int32_t s0; int32_t s1;
|
||||
int32_t p0; int32_t p1;
|
||||
|
|
@ -10098,7 +10098,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
|
||||
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
|
||||
|
||||
elements = { OW * KW * KH, OH, batch * IC };
|
||||
const uint32_t CHW = IC * KH * KW;
|
||||
// Cap X workgroups to limit concurrent IC channel reads.
|
||||
// The shader loops over X to cover the full CHW dimension.
|
||||
// AMD prefers a lower limit
|
||||
const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u;
|
||||
const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW));
|
||||
elements = { x_elements, OW, OH * batch };
|
||||
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
} break;
|
||||
|
|
@ -11761,7 +11767,6 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
||||
const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
|
||||
|
||||
const uint32_t pelements = OW * KW * KH;
|
||||
const uint32_t batch = src1->ne[is_2D ? 3 : 2];
|
||||
|
||||
const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
|
|
@ -11773,7 +11778,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
dst_addr,
|
||||
batch_offset, offset_delta,
|
||||
IC, IW, IH, OW, OH, KW, KH,
|
||||
pelements,
|
||||
OH * batch,
|
||||
IC * KH * KW,
|
||||
s0, s1, p0, p1, d0, d1, batch * IC
|
||||
});
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ layout (push_constant) uniform parameter
|
|||
uint IW; uint IH;
|
||||
uint OW; uint OH;
|
||||
uint KW; uint KH;
|
||||
uint pelements;
|
||||
uint OH_batch;
|
||||
uint CHW;
|
||||
int s0; int s1;
|
||||
int p0; int p1;
|
||||
|
|
@ -34,82 +34,60 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|||
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
|
||||
#endif
|
||||
|
||||
void im2col(const uint y, const uint z) {
|
||||
const uint gidx = gl_GlobalInvocationID.x;
|
||||
void im2col(const uint ow, const uint z_idx) {
|
||||
const uint oh = z_idx % p.OH;
|
||||
const uint batch_idx = z_idx / p.OH;
|
||||
|
||||
const uint oh = y;
|
||||
const uint batch = z / p.IC;
|
||||
const uint ic = z % p.IC;
|
||||
const uint gidx = gl_LocalInvocationID.x;
|
||||
const uint src_batch = batch_idx * p.batch_offset;
|
||||
const BDA_OFFSET_T dst_row = ((BDA_OFFSET_T(batch_idx) * p.OH + oh) * p.OW + ow) * p.CHW;
|
||||
|
||||
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
|
||||
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
|
||||
const int oh_s1 = int(oh) * p.s1;
|
||||
const uint ksize = p.OW * p.KH;
|
||||
const uint KHKW = p.KH * p.KW;
|
||||
|
||||
const uint base_linear_idx = gidx * NUM_ITER;
|
||||
uint wg_x = gl_WorkGroupID.x;
|
||||
do {
|
||||
const uint wg_offset = wg_x * 512;
|
||||
|
||||
uint current_kx = base_linear_idx / ksize;
|
||||
const uint rem = base_linear_idx - (current_kx * ksize);
|
||||
uint current_ky = rem / p.OW;
|
||||
uint current_ix = rem % p.OW;
|
||||
[[unroll]] for (uint i = 0; i < NUM_ITER; ++i) {
|
||||
const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE;
|
||||
|
||||
A_TYPE values[NUM_ITER];
|
||||
BDA_OFFSET_T offset_dst[NUM_ITER];
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
values[idx] = A_TYPE(0);
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
|
||||
const uint linear_idx = base_linear_idx + idx;
|
||||
|
||||
if (linear_idx >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
|
||||
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;
|
||||
|
||||
offset_dst[idx] = dst_base + BDA_OFFSET_T(current_ix) * p.CHW + current_ky * p.KW + current_kx;
|
||||
|
||||
if ((iih < p.IH) && (iiw < p.IW)) {
|
||||
values[idx] = data_a[src_base + iih * p.IW + iiw];
|
||||
}
|
||||
|
||||
if (++current_ix == p.OW) {
|
||||
current_ix = 0;
|
||||
if (++current_ky == p.KH) {
|
||||
current_ky = 0;
|
||||
current_kx++;
|
||||
if (chw_idx >= p.CHW) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
const uint ic = chw_idx / KHKW;
|
||||
const uint rem = chw_idx - ic * KHKW;
|
||||
const uint ky = rem / p.KW;
|
||||
const uint kx = rem - ky * p.KW;
|
||||
|
||||
const uint linear_idx = base_linear_idx + idx;
|
||||
const uint iiw = ow * p.s0 + kx * p.d0 - p.p0;
|
||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||
|
||||
if (linear_idx >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
A_TYPE val = A_TYPE(0);
|
||||
if (iih < p.IH && iiw < p.IW) {
|
||||
val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw];
|
||||
}
|
||||
|
||||
#if BDA
|
||||
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst[idx]);
|
||||
dst_addr.d = D_TYPE(values[idx]);
|
||||
D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx));
|
||||
out_ptr.d = D_TYPE(val);
|
||||
#else
|
||||
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
|
||||
data_d[dst_row + chw_idx] = D_TYPE(val);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
wg_x += gl_NumWorkGroups.x;
|
||||
} while (wg_x * 512 < p.CHW);
|
||||
}
|
||||
|
||||
void main() {
|
||||
uint y = gl_GlobalInvocationID.y;
|
||||
while (y < p.OH) {
|
||||
uint ow = gl_GlobalInvocationID.y;
|
||||
while (ow < p.OW) {
|
||||
uint z = gl_GlobalInvocationID.z;
|
||||
while (z < p.batch_IC) {
|
||||
im2col(y, z);
|
||||
while (z < p.OH_batch) {
|
||||
im2col(ow, z);
|
||||
z += gl_NumWorkGroups.z;
|
||||
}
|
||||
y += gl_NumWorkGroups.y;
|
||||
ow += gl_NumWorkGroups.y;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ mtmd_context_params mtmd_context_params_default() {
|
|||
/* use_gpu */ true,
|
||||
/* print_timings */ true,
|
||||
/* n_threads */ 4,
|
||||
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
||||
/* image_marker */ nullptr,
|
||||
/* media_marker */ mtmd_default_marker(),
|
||||
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
||||
/* warmup */ true,
|
||||
|
|
@ -169,7 +169,7 @@ struct mtmd_context {
|
|||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (llama_model_n_embd_inp(text_model))
|
||||
{
|
||||
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
||||
if (ctx_params.image_marker != nullptr) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
}
|
||||
|
||||
|
|
@ -584,9 +584,6 @@ struct mtmd_tokenizer {
|
|||
parse_special = text->parse_special;
|
||||
input_text = text->text;
|
||||
vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
}
|
||||
|
||||
int32_t tokenize(mtmd_input_chunks * output) {
|
||||
|
|
|
|||
|
|
@ -46,9 +46,6 @@
|
|||
# define MTMD_API
|
||||
#endif
|
||||
|
||||
// deprecated marker, use mtmd_default_marker() instead
|
||||
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -84,6 +84,14 @@ std::string gen_tool_call_id() {
|
|||
return random_string();
|
||||
}
|
||||
|
||||
static std::string media_marker = "";
|
||||
const char * get_media_marker() {
|
||||
if (media_marker.empty()) {
|
||||
media_marker = "<__media_" + random_string() + "__>";
|
||||
}
|
||||
return media_marker.c_str();
|
||||
}
|
||||
|
||||
//
|
||||
// lora utils
|
||||
//
|
||||
|
|
@ -975,7 +983,7 @@ json oaicompat_chat_params_parse(
|
|||
handle_media(out_files, image_url, opt.media_path);
|
||||
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p["text"] = get_media_marker();
|
||||
p.erase("image_url");
|
||||
|
||||
} else if (type == "input_audio") {
|
||||
|
|
@ -996,7 +1004,7 @@ json oaicompat_chat_params_parse(
|
|||
// TODO: add audio_url support by reusing handle_media()
|
||||
|
||||
p["type"] = "media_marker";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p["text"] = get_media_marker();
|
||||
p.erase("input_audio");
|
||||
|
||||
} else if (type != "text") {
|
||||
|
|
@ -1460,7 +1468,7 @@ json convert_transcriptions_to_chatcmpl(
|
|||
if (!language.empty()) {
|
||||
prompt += string_format(" (language: %s)", language.c_str());
|
||||
}
|
||||
prompt += mtmd_default_marker();
|
||||
prompt += get_media_marker();
|
||||
|
||||
json chatcmpl_body = inp_body; // copy all fields
|
||||
chatcmpl_body["messages"] = json::array({
|
||||
|
|
|
|||
|
|
@ -92,6 +92,9 @@ std::string random_string();
|
|||
std::string gen_chatcmplid();
|
||||
std::string gen_tool_call_id();
|
||||
|
||||
// get a random marker; note: each time the server restarts, the marker will be different
|
||||
const char * get_media_marker();
|
||||
|
||||
//
|
||||
// lora utils
|
||||
//
|
||||
|
|
|
|||
|
|
@ -708,6 +708,7 @@ private:
|
|||
mparams.warmup = params_base.warmup;
|
||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||
mparams.media_marker = get_media_marker();
|
||||
|
||||
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
|
||||
if (mctx == nullptr) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue