mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
up to date merge, without vulkan-gen-shaders. They will be built before each release from now on, as they are very large
This commit is contained in:
commit
ed75f8a741
24 changed files with 655 additions and 573 deletions
|
@ -1,3 +0,0 @@
|
||||||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
|
||||||
|
|
||||||
ci/ @ggerganov
|
|
|
@ -146,6 +146,35 @@ static void common_params_handle_model_default(common_params & params) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::vector<ggml_type> kv_cache_types = {
|
||||||
|
GGML_TYPE_F32,
|
||||||
|
GGML_TYPE_F16,
|
||||||
|
GGML_TYPE_BF16,
|
||||||
|
GGML_TYPE_Q8_0,
|
||||||
|
GGML_TYPE_Q4_0,
|
||||||
|
GGML_TYPE_Q4_1,
|
||||||
|
GGML_TYPE_IQ4_NL,
|
||||||
|
GGML_TYPE_Q5_0,
|
||||||
|
GGML_TYPE_Q5_1,
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||||
|
for (const auto & type : kv_cache_types) {
|
||||||
|
if (ggml_type_name(type) == s) {
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw std::runtime_error("Unsupported cache type: " + s);
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string get_all_kv_cache_types() {
|
||||||
|
std::ostringstream msg;
|
||||||
|
for (const auto & type : kv_cache_types) {
|
||||||
|
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
|
||||||
|
}
|
||||||
|
return msg.str();
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing functions
|
// CLI argument parsing functions
|
||||||
//
|
//
|
||||||
|
@ -1175,18 +1204,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ctk", "--cache-type-k"}, "TYPE",
|
{"-ctk", "--cache-type-k"}, "TYPE",
|
||||||
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
string_format(
|
||||||
|
"KV cache data type for K\n"
|
||||||
|
"allowed values: %s\n"
|
||||||
|
"(default: %s)",
|
||||||
|
get_all_kv_cache_types().c_str(),
|
||||||
|
ggml_type_name(params.cache_type_k)
|
||||||
|
),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
// TODO: get the type right here
|
params.cache_type_k = kv_cache_type_from_str(value);
|
||||||
params.cache_type_k = value;
|
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ctv", "--cache-type-v"}, "TYPE",
|
{"-ctv", "--cache-type-v"}, "TYPE",
|
||||||
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
string_format(
|
||||||
|
"KV cache data type for V\n"
|
||||||
|
"allowed values: %s\n"
|
||||||
|
"(default: %s)",
|
||||||
|
get_all_kv_cache_types().c_str(),
|
||||||
|
ggml_type_name(params.cache_type_v)
|
||||||
|
),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
// TODO: get the type right here
|
params.cache_type_v = kv_cache_type_from_str(value);
|
||||||
params.cache_type_v = value;
|
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
@ -2084,35 +2123,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.speculative.n_max = value;
|
params.speculative.n_max = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--draft-min", "--draft-n-min"}, "N",
|
{"--draft-min", "--draft-n-min"}, "N",
|
||||||
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
|
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.speculative.n_min = value;
|
params.speculative.n_min = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--draft-p-split"}, "P",
|
{"--draft-p-split"}, "P",
|
||||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
|
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.speculative.p_split = std::stof(value);
|
params.speculative.p_split = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--draft-p-min"}, "P",
|
{"--draft-p-min"}, "P",
|
||||||
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
|
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.speculative.p_min = std::stof(value);
|
params.speculative.p_min = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-cd", "--ctx-size-draft"}, "N",
|
{"-cd", "--ctx-size-draft"}, "N",
|
||||||
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
|
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.speculative.n_ctx = value;
|
params.speculative.n_ctx = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
|
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||||
|
@ -2132,14 +2171,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-md", "--model-draft"}, "FNAME",
|
{"-md", "--model-draft"}, "FNAME",
|
||||||
"draft model for speculative decoding (default: unused)",
|
"draft model for speculative decoding (default: unused)",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.speculative.model = value;
|
params.speculative.model = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1017,38 +1017,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||||
return mparams;
|
return mparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|
||||||
if (s == "f32") {
|
|
||||||
return GGML_TYPE_F32;
|
|
||||||
}
|
|
||||||
if (s == "f16") {
|
|
||||||
return GGML_TYPE_F16;
|
|
||||||
}
|
|
||||||
if (s == "bf16") {
|
|
||||||
return GGML_TYPE_BF16;
|
|
||||||
}
|
|
||||||
if (s == "q8_0") {
|
|
||||||
return GGML_TYPE_Q8_0;
|
|
||||||
}
|
|
||||||
if (s == "q4_0") {
|
|
||||||
return GGML_TYPE_Q4_0;
|
|
||||||
}
|
|
||||||
if (s == "q4_1") {
|
|
||||||
return GGML_TYPE_Q4_1;
|
|
||||||
}
|
|
||||||
if (s == "iq4_nl") {
|
|
||||||
return GGML_TYPE_IQ4_NL;
|
|
||||||
}
|
|
||||||
if (s == "q5_0") {
|
|
||||||
return GGML_TYPE_Q5_0;
|
|
||||||
}
|
|
||||||
if (s == "q5_1") {
|
|
||||||
return GGML_TYPE_Q5_1;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw std::runtime_error("Unsupported cache type: " + s);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
|
|
||||||
|
@ -1083,8 +1051,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||||
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||||
}
|
}
|
||||||
|
|
||||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
cparams.type_k = params.cache_type_k;
|
||||||
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
cparams.type_v = params.cache_type_v;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
}
|
}
|
||||||
|
|
|
@ -282,8 +282,8 @@ struct common_params {
|
||||||
bool warmup = true; // warmup run
|
bool warmup = true; // warmup run
|
||||||
bool check_tensors = false; // validate tensor data
|
bool check_tensors = false; // validate tensor data
|
||||||
|
|
||||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||||
|
|
|
@ -288,7 +288,7 @@ struct split_strategy {
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_info() {
|
void print_info() {
|
||||||
printf("n_split: %ld\n", ctx_outs.size());
|
printf("n_split: %zu\n", ctx_outs.size());
|
||||||
int i_split = 0;
|
int i_split = 0;
|
||||||
for (auto & ctx_out : ctx_outs) {
|
for (auto & ctx_out : ctx_outs) {
|
||||||
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
|
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
|
||||||
|
@ -298,7 +298,7 @@ struct split_strategy {
|
||||||
total_size += ggml_nbytes(t);
|
total_size += ggml_nbytes(t);
|
||||||
}
|
}
|
||||||
total_size = total_size / 1000 / 1000; // convert to megabytes
|
total_size = total_size / 1000 / 1000; // convert to megabytes
|
||||||
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
|
||||||
i_split++;
|
i_split++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||||
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
|
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
|
||||||
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
|
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
|
||||||
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
|
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
|
||||||
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
|
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
|
||||||
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
|
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
|
||||||
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
|
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
|
||||||
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
|
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
|
||||||
if (res < 2) {
|
if (res < 2) {
|
||||||
return "";
|
return "";
|
||||||
} else {
|
} else {
|
||||||
std::vector<char> model_template(res, 0);
|
std::vector<char> model_template(res + 1, 0);
|
||||||
llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||||
return std::string(model_template.data(), model_template.size() - 1);
|
return std::string(model_template.data(), model_template.size() - 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
<!-- sidebar -->
|
<!-- sidebar -->
|
||||||
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
|
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
|
||||||
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
|
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
|
||||||
<div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
|
<div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
|
||||||
<div class="flex flex-row items-center justify-between mb-4 mt-4">
|
<div class="flex flex-row items-center justify-between mb-4 mt-4">
|
||||||
<h2 class="font-bold ml-4">Conversations</h2>
|
<h2 class="font-bold ml-4">Conversations</h2>
|
||||||
|
|
||||||
|
@ -120,51 +120,25 @@
|
||||||
{{ messages.length === 0 ? 'Send a message to start' : '' }}
|
{{ messages.length === 0 ? 'Send a message to start' : '' }}
|
||||||
</div>
|
</div>
|
||||||
<div v-for="msg in messages" class="group">
|
<div v-for="msg in messages" class="group">
|
||||||
<div :class="{
|
<message-bubble
|
||||||
'chat': true,
|
:config="config"
|
||||||
'chat-start': msg.role !== 'user',
|
:msg="msg"
|
||||||
'chat-end': msg.role === 'user',
|
:key="msg.id"
|
||||||
}">
|
:is-generating="isGenerating"
|
||||||
<div :class="{
|
:edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
|
||||||
'chat-bubble markdown': true,
|
:regenerate-msg="regenerateMsg"></message-bubble>
|
||||||
'chat-bubble-base-300': msg.role !== 'user',
|
|
||||||
}">
|
|
||||||
<!-- textarea for editing message -->
|
|
||||||
<template v-if="editingMsg && editingMsg.id === msg.id">
|
|
||||||
<textarea
|
|
||||||
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
|
|
||||||
v-model="msg.content"></textarea>
|
|
||||||
<br/>
|
|
||||||
<button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
|
|
||||||
<button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
|
|
||||||
</template>
|
|
||||||
<!-- render message as markdown -->
|
|
||||||
<vue-markdown v-else :source="msg.content" />
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- actions for each message -->
|
|
||||||
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
|
|
||||||
<!-- user message -->
|
|
||||||
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
|
|
||||||
✍️ Edit
|
|
||||||
</button>
|
|
||||||
<!-- assistant message -->
|
|
||||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
|
||||||
🔄 Regenerate
|
|
||||||
</button>
|
|
||||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
|
||||||
📋 Copy
|
|
||||||
</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- pending (ongoing) assistant message -->
|
<!-- pending (ongoing) assistant message -->
|
||||||
<div id="pending-msg" class="chat chat-start">
|
<div id="pending-msg" class="group">
|
||||||
<div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
|
<message-bubble
|
||||||
<span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
|
v-if="pendingMsg"
|
||||||
<vue-markdown v-else :source="pendingMsg.content" />
|
:config="config"
|
||||||
</div>
|
:msg="pendingMsg"
|
||||||
|
:key="pendingMsg.id"
|
||||||
|
:is-generating="isGenerating"
|
||||||
|
:edit-user-msg-and-regenerate="() => {}"
|
||||||
|
:regenerate-msg="() => {}"></message-bubble>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -227,6 +201,10 @@
|
||||||
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||||
<summary class="collapse-title font-bold">Advanced config</summary>
|
<summary class="collapse-title font-bold">Advanced config</summary>
|
||||||
<div class="collapse-content">
|
<div class="collapse-content">
|
||||||
|
<div class="flex flex-row items-center mb-2">
|
||||||
|
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
|
||||||
|
<span class="ml-4">Show tokens per second</span>
|
||||||
|
</div>
|
||||||
<label class="form-control mb-2">
|
<label class="form-control mb-2">
|
||||||
<!-- Custom parameters input -->
|
<!-- Custom parameters input -->
|
||||||
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
|
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
|
||||||
|
@ -247,6 +225,66 @@
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Template to be used as message bubble -->
|
||||||
|
<template id="message-bubble">
|
||||||
|
<div :class="{
|
||||||
|
'chat': true,
|
||||||
|
'chat-start': msg.role !== 'user',
|
||||||
|
'chat-end': msg.role === 'user',
|
||||||
|
}">
|
||||||
|
<div :class="{
|
||||||
|
'chat-bubble markdown': true,
|
||||||
|
'chat-bubble-base-300': msg.role !== 'user',
|
||||||
|
}">
|
||||||
|
<!-- textarea for editing message -->
|
||||||
|
<template v-if="editingContent !== null">
|
||||||
|
<textarea
|
||||||
|
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
|
||||||
|
v-model="editingContent"></textarea>
|
||||||
|
<br/>
|
||||||
|
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
|
||||||
|
<button class="btn mt-2" @click="editMsg()">Submit</button>
|
||||||
|
</template>
|
||||||
|
<template v-else>
|
||||||
|
<!-- show loading dots for pending message -->
|
||||||
|
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
|
||||||
|
<!-- render message as markdown -->
|
||||||
|
<vue-markdown v-else :source="msg.content"></vue-markdown>
|
||||||
|
<!-- render timings if enabled -->
|
||||||
|
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
|
||||||
|
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
|
||||||
|
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
|
||||||
|
<b>Prompt</b><br/>
|
||||||
|
- Tokens: {{ timings.prompt_n }}<br/>
|
||||||
|
- Time: {{ timings.prompt_ms }} ms<br/>
|
||||||
|
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
|
||||||
|
<b>Generation</b><br/>
|
||||||
|
- Tokens: {{ timings.predicted_n }}<br/>
|
||||||
|
- Time: {{ timings.predicted_ms }} ms<br/>
|
||||||
|
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<!-- actions for each message -->
|
||||||
|
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
|
||||||
|
<!-- user message -->
|
||||||
|
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
|
||||||
|
✍️ Edit
|
||||||
|
</button>
|
||||||
|
<!-- assistant message -->
|
||||||
|
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||||
|
🔄 Regenerate
|
||||||
|
</button>
|
||||||
|
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
|
||||||
|
📋 Copy
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
|
||||||
<!-- Template to be used by settings modal -->
|
<!-- Template to be used by settings modal -->
|
||||||
<template id="settings-modal-short-input">
|
<template id="settings-modal-short-input">
|
||||||
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
|
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
|
||||||
|
|
7
examples/server/webui/package-lock.json
generated
7
examples/server/webui/package-lock.json
generated
|
@ -13,6 +13,7 @@
|
||||||
"markdown-it": "^14.1.0",
|
"markdown-it": "^14.1.0",
|
||||||
"postcss": "^8.4.49",
|
"postcss": "^8.4.49",
|
||||||
"tailwindcss": "^3.4.15",
|
"tailwindcss": "^3.4.15",
|
||||||
|
"textlinestream": "^1.1.1",
|
||||||
"vite-plugin-singlefile": "^2.0.3",
|
"vite-plugin-singlefile": "^2.0.3",
|
||||||
"vue": "^3.5.13"
|
"vue": "^3.5.13"
|
||||||
},
|
},
|
||||||
|
@ -2677,6 +2678,12 @@
|
||||||
"node": ">=14.0.0"
|
"node": ">=14.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/textlinestream": {
|
||||||
|
"version": "1.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
|
||||||
|
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/uc.micro": {
|
"node_modules/uc.micro": {
|
||||||
"version": "2.1.0",
|
"version": "2.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
"markdown-it": "^14.1.0",
|
"markdown-it": "^14.1.0",
|
||||||
"postcss": "^8.4.49",
|
"postcss": "^8.4.49",
|
||||||
"tailwindcss": "^3.4.15",
|
"tailwindcss": "^3.4.15",
|
||||||
|
"textlinestream": "^1.1.1",
|
||||||
"vite-plugin-singlefile": "^2.0.3",
|
"vite-plugin-singlefile": "^2.0.3",
|
||||||
"vue": "^3.5.13"
|
"vue": "^3.5.13"
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,225 +0,0 @@
|
||||||
const paramDefaults = {
|
|
||||||
stream: true,
|
|
||||||
temperature: 0.2,
|
|
||||||
};
|
|
||||||
|
|
||||||
let generation_settings = null;
|
|
||||||
|
|
||||||
export class CompletionError extends Error {
|
|
||||||
constructor(message, name, data) {
|
|
||||||
super(message);
|
|
||||||
this.name = name;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Completes the prompt as a generator. Recommended for most use cases.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
//
|
|
||||||
// import { llama } from '/completion.js'
|
|
||||||
//
|
|
||||||
// const request = llama("Tell me a joke", {n_predict: 800})
|
|
||||||
// for await (const chunk of request) {
|
|
||||||
// document.write(chunk.data.content)
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
export async function* llama(prompt, params = {}, config = {}) {
|
|
||||||
let controller = config.controller;
|
|
||||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
|
||||||
|
|
||||||
if (!controller) {
|
|
||||||
controller = new AbortController();
|
|
||||||
}
|
|
||||||
|
|
||||||
const completionParams = { ...paramDefaults, ...params, prompt };
|
|
||||||
|
|
||||||
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
|
|
||||||
method: 'POST',
|
|
||||||
body: JSON.stringify(completionParams),
|
|
||||||
headers: {
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
'Accept': 'text/event-stream',
|
|
||||||
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
|
|
||||||
},
|
|
||||||
signal: controller.signal,
|
|
||||||
});
|
|
||||||
|
|
||||||
const status = response.status;
|
|
||||||
if (status !== 200) {
|
|
||||||
try {
|
|
||||||
const body = await response.json();
|
|
||||||
if (body && body.error && body.error.message) {
|
|
||||||
throw new CompletionError(body.error.message, 'ServerError');
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
throw new CompletionError(err.message, 'ServerError');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const reader = response.body.getReader();
|
|
||||||
const decoder = new TextDecoder();
|
|
||||||
|
|
||||||
let content = "";
|
|
||||||
let leftover = ""; // Buffer for partially read lines
|
|
||||||
|
|
||||||
try {
|
|
||||||
let cont = true;
|
|
||||||
|
|
||||||
while (cont) {
|
|
||||||
const result = await reader.read();
|
|
||||||
if (result.done) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add any leftover data to the current chunk of data
|
|
||||||
const text = leftover + decoder.decode(result.value);
|
|
||||||
|
|
||||||
// Check if the last character is a line break
|
|
||||||
const endsWithLineBreak = text.endsWith('\n');
|
|
||||||
|
|
||||||
// Split the text into lines
|
|
||||||
let lines = text.split('\n');
|
|
||||||
|
|
||||||
// If the text doesn't end with a line break, then the last line is incomplete
|
|
||||||
// Store it in leftover to be added to the next chunk of data
|
|
||||||
if (!endsWithLineBreak) {
|
|
||||||
leftover = lines.pop();
|
|
||||||
} else {
|
|
||||||
leftover = ""; // Reset leftover if we have a line break at the end
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse all sse events and add them to result
|
|
||||||
const regex = /^(\S+):\s(.*)$/gm;
|
|
||||||
for (const line of lines) {
|
|
||||||
const match = regex.exec(line);
|
|
||||||
if (match) {
|
|
||||||
result[match[1]] = match[2];
|
|
||||||
if (result.data === '[DONE]') {
|
|
||||||
cont = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// since we know this is llama.cpp, let's just decode the json in data
|
|
||||||
if (result.data) {
|
|
||||||
result.data = JSON.parse(result.data);
|
|
||||||
content += result.data.content;
|
|
||||||
|
|
||||||
// yield
|
|
||||||
yield result;
|
|
||||||
|
|
||||||
// if we got a stop token from server, we will break here
|
|
||||||
if (result.data.stop) {
|
|
||||||
if (result.data.generation_settings) {
|
|
||||||
generation_settings = result.data.generation_settings;
|
|
||||||
}
|
|
||||||
cont = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (result.error) {
|
|
||||||
try {
|
|
||||||
result.error = JSON.parse(result.error);
|
|
||||||
if (result.error.message.includes('slot unavailable')) {
|
|
||||||
// Throw an error to be caught by upstream callers
|
|
||||||
throw new Error('slot unavailable');
|
|
||||||
} else {
|
|
||||||
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
|
|
||||||
}
|
|
||||||
} catch(e) {
|
|
||||||
console.error(`llama.cpp error ${result.error}`)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
if (e.name !== 'AbortError') {
|
|
||||||
console.error("llama error: ", e);
|
|
||||||
}
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
controller.abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
return content;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call llama, return an event target that you can subscribe to
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
//
|
|
||||||
// import { llamaEventTarget } from '/completion.js'
|
|
||||||
//
|
|
||||||
// const conn = llamaEventTarget(prompt)
|
|
||||||
// conn.addEventListener("message", (chunk) => {
|
|
||||||
// document.write(chunk.detail.content)
|
|
||||||
// })
|
|
||||||
//
|
|
||||||
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
|
|
||||||
const eventTarget = new EventTarget();
|
|
||||||
(async () => {
|
|
||||||
let content = "";
|
|
||||||
for await (const chunk of llama(prompt, params, config)) {
|
|
||||||
if (chunk.data) {
|
|
||||||
content += chunk.data.content;
|
|
||||||
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
|
|
||||||
}
|
|
||||||
if (chunk.data.generation_settings) {
|
|
||||||
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
|
|
||||||
}
|
|
||||||
if (chunk.data.timings) {
|
|
||||||
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
|
|
||||||
})();
|
|
||||||
return eventTarget;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call llama, return a promise that resolves to the completed text. This does not support streaming
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
//
|
|
||||||
// llamaPromise(prompt).then((content) => {
|
|
||||||
// document.write(content)
|
|
||||||
// })
|
|
||||||
//
|
|
||||||
// or
|
|
||||||
//
|
|
||||||
// const content = await llamaPromise(prompt)
|
|
||||||
// document.write(content)
|
|
||||||
//
|
|
||||||
export const llamaPromise = (prompt, params = {}, config = {}) => {
|
|
||||||
return new Promise(async (resolve, reject) => {
|
|
||||||
let content = "";
|
|
||||||
try {
|
|
||||||
for await (const chunk of llama(prompt, params, config)) {
|
|
||||||
content += chunk.data.content;
|
|
||||||
}
|
|
||||||
resolve(content);
|
|
||||||
} catch (error) {
|
|
||||||
reject(error);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* (deprecated)
|
|
||||||
*/
|
|
||||||
export const llamaComplete = async (params, controller, callback) => {
|
|
||||||
for await (const chunk of llama(params.prompt, params, { controller })) {
|
|
||||||
callback(chunk);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the model info from the server. This is useful for getting the context window and so on.
|
|
||||||
export const llamaModelInfo = async (config = {}) => {
|
|
||||||
if (!generation_settings) {
|
|
||||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
|
||||||
const props = await fetch(`${api_url}/props`).then(r => r.json());
|
|
||||||
generation_settings = props.default_generation_settings;
|
|
||||||
}
|
|
||||||
return generation_settings;
|
|
||||||
}
|
|
|
@ -1,21 +1,25 @@
|
||||||
import './styles.css';
|
import './styles.css';
|
||||||
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
|
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
|
||||||
import { llama } from './completion.js';
|
|
||||||
import MarkdownIt from 'markdown-it';
|
import MarkdownIt from 'markdown-it';
|
||||||
|
import TextLineStream from 'textlinestream';
|
||||||
|
|
||||||
|
const isDev = import.meta.env.MODE === 'development';
|
||||||
|
|
||||||
// utility functions
|
// utility functions
|
||||||
const isString = (x) => !!x.toLowerCase;
|
const isString = (x) => !!x.toLowerCase;
|
||||||
const isNumeric = (n) => !isString(n) && !isNaN(n);
|
const isBoolean = (x) => x === true || x === false;
|
||||||
|
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
|
||||||
const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"');
|
const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"');
|
||||||
const copyStr = (str) => navigator.clipboard.writeText(str);
|
const copyStr = (str) => navigator.clipboard.writeText(str);
|
||||||
|
|
||||||
// constants
|
// constants
|
||||||
const BASE_URL = localStorage.getItem('base') // for debugging
|
const BASE_URL = localStorage.getItem('base') // for debugging
|
||||||
|| (new URL('.', document.baseURI).href).toString(); // for production
|
|| (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
|
||||||
const CONFIG_DEFAULT = {
|
const CONFIG_DEFAULT = {
|
||||||
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
|
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
|
||||||
apiKey: '',
|
apiKey: '',
|
||||||
systemMessage: 'You are a helpful assistant.',
|
systemMessage: 'You are a helpful assistant.',
|
||||||
|
showTokensPerSecond: false,
|
||||||
// make sure these default values are in sync with `common.h`
|
// make sure these default values are in sync with `common.h`
|
||||||
samplers: 'dkypmxt',
|
samplers: 'dkypmxt',
|
||||||
temperature: 0.8,
|
temperature: 0.8,
|
||||||
|
@ -101,6 +105,48 @@ const SettingsModalShortInput = defineComponent({
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// message bubble component
|
||||||
|
const MessageBubble = defineComponent({
|
||||||
|
components: {
|
||||||
|
VueMarkdown
|
||||||
|
},
|
||||||
|
template: document.getElementById('message-bubble').innerHTML,
|
||||||
|
props: {
|
||||||
|
config: Object,
|
||||||
|
msg: Object,
|
||||||
|
isGenerating: Boolean,
|
||||||
|
editUserMsgAndRegenerate: Function,
|
||||||
|
regenerateMsg: Function,
|
||||||
|
},
|
||||||
|
data() {
|
||||||
|
return {
|
||||||
|
editingContent: null,
|
||||||
|
};
|
||||||
|
},
|
||||||
|
computed: {
|
||||||
|
timings() {
|
||||||
|
if (!this.msg.timings) return null;
|
||||||
|
return {
|
||||||
|
...this.msg.timings,
|
||||||
|
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
|
||||||
|
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
},
|
||||||
|
methods: {
|
||||||
|
copyMsg() {
|
||||||
|
copyStr(this.msg.content);
|
||||||
|
},
|
||||||
|
editMsg() {
|
||||||
|
this.editUserMsgAndRegenerate({
|
||||||
|
...this.msg,
|
||||||
|
content: this.editingContent,
|
||||||
|
});
|
||||||
|
this.editingContent = null;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
// coversations is stored in localStorage
|
// coversations is stored in localStorage
|
||||||
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
|
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
|
||||||
// convId is a string prefixed with 'conv-'
|
// convId is a string prefixed with 'conv-'
|
||||||
|
@ -192,10 +238,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// wrapper for SSE
|
||||||
|
async function* sendSSEPostRequest(url, fetchOptions) {
|
||||||
|
const res = await fetch(url, fetchOptions);
|
||||||
|
const lines = res.body
|
||||||
|
.pipeThrough(new TextDecoderStream())
|
||||||
|
.pipeThrough(new TextLineStream());
|
||||||
|
for await (const line of lines) {
|
||||||
|
if (isDev) console.log({line});
|
||||||
|
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
|
||||||
|
const data = JSON.parse(line.slice(5));
|
||||||
|
yield data;
|
||||||
|
} else if (line.startsWith('error:')) {
|
||||||
|
const data = JSON.parse(line.slice(6));
|
||||||
|
throw new Error(data.message || 'Unknown error');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const mainApp = createApp({
|
const mainApp = createApp({
|
||||||
components: {
|
components: {
|
||||||
VueMarkdown,
|
VueMarkdown,
|
||||||
SettingsModalShortInput,
|
SettingsModalShortInput,
|
||||||
|
MessageBubble,
|
||||||
},
|
},
|
||||||
data() {
|
data() {
|
||||||
return {
|
return {
|
||||||
|
@ -209,7 +274,6 @@ const mainApp = createApp({
|
||||||
selectedTheme: StorageUtils.getTheme(),
|
selectedTheme: StorageUtils.getTheme(),
|
||||||
config: StorageUtils.getConfig(),
|
config: StorageUtils.getConfig(),
|
||||||
showConfigDialog: false,
|
showConfigDialog: false,
|
||||||
editingMsg: null,
|
|
||||||
// const
|
// const
|
||||||
themes: THEMES,
|
themes: THEMES,
|
||||||
configDefault: {...CONFIG_DEFAULT},
|
configDefault: {...CONFIG_DEFAULT},
|
||||||
|
@ -226,6 +290,15 @@ const mainApp = createApp({
|
||||||
});
|
});
|
||||||
resizeObserver.observe(pendingMsgElem);
|
resizeObserver.observe(pendingMsgElem);
|
||||||
},
|
},
|
||||||
|
watch: {
|
||||||
|
viewingConvId: function(val, oldVal) {
|
||||||
|
if (val != oldVal) {
|
||||||
|
this.fetchMessages();
|
||||||
|
chatScrollToBottom();
|
||||||
|
this.hideSidebar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
methods: {
|
methods: {
|
||||||
hideSidebar() {
|
hideSidebar() {
|
||||||
document.getElementById('toggle-drawer').checked = false;
|
document.getElementById('toggle-drawer').checked = false;
|
||||||
|
@ -237,18 +310,10 @@ const mainApp = createApp({
|
||||||
newConversation() {
|
newConversation() {
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
this.viewingConvId = StorageUtils.getNewConvId();
|
this.viewingConvId = StorageUtils.getNewConvId();
|
||||||
this.editingMsg = null;
|
|
||||||
this.fetchMessages();
|
|
||||||
chatScrollToBottom();
|
|
||||||
this.hideSidebar();
|
|
||||||
},
|
},
|
||||||
setViewingConv(convId) {
|
setViewingConv(convId) {
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
this.viewingConvId = convId;
|
this.viewingConvId = convId;
|
||||||
this.editingMsg = null;
|
|
||||||
this.fetchMessages();
|
|
||||||
chatScrollToBottom();
|
|
||||||
this.hideSidebar();
|
|
||||||
},
|
},
|
||||||
deleteConv(convId) {
|
deleteConv(convId) {
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
|
@ -256,7 +321,6 @@ const mainApp = createApp({
|
||||||
StorageUtils.remove(convId);
|
StorageUtils.remove(convId);
|
||||||
if (this.viewingConvId === convId) {
|
if (this.viewingConvId === convId) {
|
||||||
this.viewingConvId = StorageUtils.getNewConvId();
|
this.viewingConvId = StorageUtils.getNewConvId();
|
||||||
this.editingMsg = null;
|
|
||||||
}
|
}
|
||||||
this.fetchConversation();
|
this.fetchConversation();
|
||||||
this.fetchMessages();
|
this.fetchMessages();
|
||||||
|
@ -291,7 +355,6 @@ const mainApp = createApp({
|
||||||
this.fetchConversation();
|
this.fetchConversation();
|
||||||
this.fetchMessages();
|
this.fetchMessages();
|
||||||
this.inputMsg = '';
|
this.inputMsg = '';
|
||||||
this.editingMsg = null;
|
|
||||||
this.generateMessage(currConvId);
|
this.generateMessage(currConvId);
|
||||||
chatScrollToBottom();
|
chatScrollToBottom();
|
||||||
},
|
},
|
||||||
|
@ -299,7 +362,6 @@ const mainApp = createApp({
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
|
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
|
||||||
this.isGenerating = true;
|
this.isGenerating = true;
|
||||||
this.editingMsg = null;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const abortController = new AbortController();
|
const abortController = new AbortController();
|
||||||
|
@ -330,17 +392,21 @@ const mainApp = createApp({
|
||||||
dry_allowed_length: this.config.dry_allowed_length,
|
dry_allowed_length: this.config.dry_allowed_length,
|
||||||
dry_penalty_last_n: this.config.dry_penalty_last_n,
|
dry_penalty_last_n: this.config.dry_penalty_last_n,
|
||||||
max_tokens: this.config.max_tokens,
|
max_tokens: this.config.max_tokens,
|
||||||
|
timings_per_token: !!this.config.showTokensPerSecond,
|
||||||
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
|
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
|
||||||
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
|
|
||||||
};
|
};
|
||||||
const config = {
|
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
|
||||||
controller: abortController,
|
method: 'POST',
|
||||||
api_url: BASE_URL,
|
headers: {
|
||||||
endpoint: '/chat/completions',
|
'Content-Type': 'application/json',
|
||||||
};
|
'Authorization': this.config.apiKey ? `Bearer ${this.config.apiKey}` : undefined,
|
||||||
for await (const chunk of llama(prompt, params, config)) {
|
},
|
||||||
const stop = chunk.data.stop;
|
body: JSON.stringify(params),
|
||||||
const addedContent = chunk.data.choices[0].delta.content;
|
signal: abortController.signal,
|
||||||
|
});
|
||||||
|
for await (const chunk of chunks) {
|
||||||
|
const stop = chunk.stop;
|
||||||
|
const addedContent = chunk.choices[0].delta.content;
|
||||||
const lastContent = this.pendingMsg.content || '';
|
const lastContent = this.pendingMsg.content || '';
|
||||||
if (addedContent) {
|
if (addedContent) {
|
||||||
this.pendingMsg = {
|
this.pendingMsg = {
|
||||||
|
@ -349,6 +415,16 @@ const mainApp = createApp({
|
||||||
content: lastContent + addedContent,
|
content: lastContent + addedContent,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
const timings = chunk.timings;
|
||||||
|
if (timings && this.config.showTokensPerSecond) {
|
||||||
|
// only extract what's really needed, to save some space
|
||||||
|
this.pendingMsg.timings = {
|
||||||
|
prompt_n: timings.prompt_n,
|
||||||
|
prompt_ms: timings.prompt_ms,
|
||||||
|
predicted_n: timings.predicted_n,
|
||||||
|
predicted_ms: timings.predicted_ms,
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
StorageUtils.appendMsg(currConvId, this.pendingMsg);
|
StorageUtils.appendMsg(currConvId, this.pendingMsg);
|
||||||
|
@ -387,14 +463,10 @@ const mainApp = createApp({
|
||||||
this.fetchMessages();
|
this.fetchMessages();
|
||||||
this.generateMessage(currConvId);
|
this.generateMessage(currConvId);
|
||||||
},
|
},
|
||||||
copyMsg(msg) {
|
|
||||||
copyStr(msg.content);
|
|
||||||
},
|
|
||||||
editUserMsgAndRegenerate(msg) {
|
editUserMsgAndRegenerate(msg) {
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
const currConvId = this.viewingConvId;
|
const currConvId = this.viewingConvId;
|
||||||
const newContent = msg.content;
|
const newContent = msg.content;
|
||||||
this.editingMsg = null;
|
|
||||||
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
|
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
|
||||||
StorageUtils.appendMsg(currConvId, {
|
StorageUtils.appendMsg(currConvId, {
|
||||||
id: Date.now(),
|
id: Date.now(),
|
||||||
|
|
|
@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
|
void * data = ggml_aligned_malloc(size);
|
||||||
if (data == NULL) {
|
if (data == NULL) {
|
||||||
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
|
@ -129,8 +129,7 @@ struct ggml_arm_arch_features_type {
|
||||||
#endif
|
#endif
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) && !defined(__clang__)
|
||||||
#if !defined(__clang__)
|
|
||||||
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
|
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
|
||||||
|
|
||||||
typedef volatile LONG atomic_int;
|
typedef volatile LONG atomic_int;
|
||||||
|
@ -458,21 +457,21 @@ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type
|
||||||
#define GGML_F32x4_ADD vaddq_f32
|
#define GGML_F32x4_ADD vaddq_f32
|
||||||
#define GGML_F32x4_MUL vmulq_f32
|
#define GGML_F32x4_MUL vmulq_f32
|
||||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||||
#define GGML_F32x4_REDUCE(res, x) \
|
#define GGML_F32x4_REDUCE(res, x) \
|
||||||
{ \
|
{ \
|
||||||
int offset = GGML_F32_ARR >> 1; \
|
int offset = GGML_F32_ARR >> 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
offset >>= 1; \
|
offset >>= 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
offset >>= 1; \
|
offset >>= 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
|
(res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define GGML_F32_VEC GGML_F32x4
|
#define GGML_F32_VEC GGML_F32x4
|
||||||
|
@ -2400,7 +2399,7 @@ static void ggml_init_arm_arch_features(void) {
|
||||||
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
||||||
|
|
||||||
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
|
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
|
||||||
ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
|
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
||||||
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
||||||
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
|
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
|
||||||
|
|
||||||
|
@ -12982,7 +12981,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
|
||||||
#include "windows.h"
|
#include "windows.h"
|
||||||
|
|
||||||
// TODO: support > 64 CPUs
|
// TODO: support > 64 CPUs
|
||||||
bool ggml_thread_apply_affinity(bool * mask) {
|
static bool ggml_thread_apply_affinity(bool * mask) {
|
||||||
HANDLE h = GetCurrentThread();
|
HANDLE h = GetCurrentThread();
|
||||||
uint64_t bitmask = 0ULL;
|
uint64_t bitmask = 0ULL;
|
||||||
|
|
||||||
|
|
|
@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
|
||||||
}
|
}
|
||||||
|
|
||||||
// non-contiguous kernel (slow)
|
// non-contiguous kernel (slow)
|
||||||
static __global__ void concat_f32_non_cont(
|
template <int dim>
|
||||||
|
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
|
||||||
|
concat_f32_non_cont(
|
||||||
const char * src0,
|
const char * src0,
|
||||||
const char * src1,
|
const char * src1,
|
||||||
char * dst,
|
char * dst,
|
||||||
|
@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
|
||||||
uint64_t nb0,
|
uint64_t nb0,
|
||||||
uint64_t nb1,
|
uint64_t nb1,
|
||||||
uint64_t nb2,
|
uint64_t nb2,
|
||||||
uint64_t nb3,
|
uint64_t nb3){
|
||||||
int32_t dim) {
|
static_assert(dim >= 0 && dim <= 3);
|
||||||
|
|
||||||
const int64_t i3 = blockIdx.z;
|
const int64_t i3 = blockIdx.z;
|
||||||
const int64_t i2 = blockIdx.y;
|
const int64_t i2 = blockIdx.y;
|
||||||
const int64_t i1 = blockIdx.x;
|
const int64_t i1 = blockIdx.x;
|
||||||
|
|
||||||
int64_t o[4] = {0, 0, 0, 0};
|
|
||||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
|
||||||
|
|
||||||
const float * x;
|
const float * x;
|
||||||
|
|
||||||
for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
|
||||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||||
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||||
} else {
|
} else {
|
||||||
x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
if constexpr (dim == 0) {
|
||||||
|
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
|
||||||
|
} else if constexpr (dim == 1) {
|
||||||
|
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
|
||||||
|
} else if constexpr (dim == 2) {
|
||||||
|
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
|
||||||
|
} else if constexpr (dim == 3) {
|
||||||
|
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
|
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||||
concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
auto launch_kernel = [&](auto dim) {
|
||||||
(const char *)src0->data,
|
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
|
||||||
(const char *)src1->data,
|
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||||
( char *)dst->data,
|
|
||||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||||
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
|
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
|
||||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||||
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
|
||||||
|
};
|
||||||
|
switch (dim) {
|
||||||
|
case 0:
|
||||||
|
launch_kernel(std::integral_constant<int, 0>{});
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
launch_kernel(std::integral_constant<int, 1>{});
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
launch_kernel(std::integral_constant<int, 2>{});
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
launch_kernel(std::integral_constant<int, 3>{});
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ABORT("Invalid dim: %d", dim);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||||
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
GGML_API void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
||||||
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||||
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
|
@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||||
|
|
||||||
// Memory allocation
|
// Memory allocation
|
||||||
|
|
||||||
void * ggml_aligned_malloc(size_t size);
|
GGML_API void * ggml_aligned_malloc(size_t size);
|
||||||
void ggml_aligned_free(void * ptr, size_t size);
|
GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||||
|
|
||||||
// FP16 to FP32 conversion
|
// FP16 to FP32 conversion
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ggml_critical_section_start(void);
|
GGML_API void ggml_critical_section_start(void);
|
||||||
void ggml_critical_section_end(void);
|
GGML_API void ggml_critical_section_end(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -163,7 +163,11 @@ struct vk_device_struct {
|
||||||
uint32_t shader_core_count;
|
uint32_t shader_core_count;
|
||||||
bool uma;
|
bool uma;
|
||||||
bool float_controls_rte_fp16;
|
bool float_controls_rte_fp16;
|
||||||
bool coopmat2;
|
|
||||||
|
bool subgroup_size_control;
|
||||||
|
uint32_t subgroup_min_size;
|
||||||
|
uint32_t subgroup_max_size;
|
||||||
|
bool subgroup_require_full_support;
|
||||||
|
|
||||||
bool coopmat_support;
|
bool coopmat_support;
|
||||||
bool coopmat_acc_f32_support;
|
bool coopmat_acc_f32_support;
|
||||||
|
@ -171,6 +175,7 @@ struct vk_device_struct {
|
||||||
uint32_t coopmat_m;
|
uint32_t coopmat_m;
|
||||||
uint32_t coopmat_n;
|
uint32_t coopmat_n;
|
||||||
uint32_t coopmat_k;
|
uint32_t coopmat_k;
|
||||||
|
bool coopmat2;
|
||||||
|
|
||||||
size_t idx;
|
size_t idx;
|
||||||
|
|
||||||
|
@ -749,8 +754,12 @@ static uint32_t compile_count = 0;
|
||||||
static std::mutex compile_count_mutex;
|
static std::mutex compile_count_mutex;
|
||||||
static std::condition_variable compile_count_cond;
|
static std::condition_variable compile_count_cond;
|
||||||
|
|
||||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
|
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint,
|
||||||
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
|
||||||
|
uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
|
||||||
|
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size <<
|
||||||
|
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align <<
|
||||||
|
", " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
|
||||||
GGML_ASSERT(parameter_count > 0);
|
GGML_ASSERT(parameter_count > 0);
|
||||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||||
|
|
||||||
|
@ -809,14 +818,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
specialization_constants.data()
|
specialization_constants.data()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
|
||||||
|
|
||||||
|
if (device->subgroup_require_full_support && require_full_subgroups) {
|
||||||
|
pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
|
||||||
|
}
|
||||||
|
|
||||||
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
|
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
|
||||||
vk::PipelineShaderStageCreateFlags(),
|
pipeline_shader_stage_create_flags,
|
||||||
vk::ShaderStageFlagBits::eCompute,
|
vk::ShaderStageFlagBits::eCompute,
|
||||||
pipeline->shader_module,
|
pipeline->shader_module,
|
||||||
entrypoint.c_str(),
|
entrypoint.c_str(),
|
||||||
&specialization_info);
|
&specialization_info);
|
||||||
|
|
||||||
|
vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
|
||||||
|
pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
|
||||||
|
if (device->subgroup_size_control && required_subgroup_size > 0) {
|
||||||
|
GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
|
||||||
|
pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
|
||||||
|
}
|
||||||
|
|
||||||
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
|
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
|
||||||
vk::PipelineCreateFlags(),
|
vk::PipelineCreateFlags{},
|
||||||
pipeline_shader_create_info,
|
pipeline_shader_create_info,
|
||||||
pipeline->layout);
|
pipeline->layout);
|
||||||
|
|
||||||
|
@ -1496,7 +1519,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
|
||||||
std::vector<std::future<void>> compiles;
|
std::vector<std::future<void>> compiles;
|
||||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
|
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
|
||||||
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||||
|
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||||
{
|
{
|
||||||
// wait until fewer than N compiles are in progress
|
// wait until fewer than N compiles are in progress
|
||||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||||
|
@ -1506,7 +1531,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
compile_count++;
|
compile_count++;
|
||||||
}
|
}
|
||||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
|
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint,
|
||||||
|
parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
|
@ -1612,40 +1638,59 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
if (device->mul_mat ## ID ## _l) \
|
if (device->mul_mat ## ID ## _l) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _m) \
|
if (device->mul_mat ## ID ## _m) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _s) \
|
if (device->mul_mat ## ID ## _s) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _l) \
|
if (device->mul_mat ## ID ## _l) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
|
||||||
if (device->mul_mat ## ID ## _m) \
|
if (device->mul_mat ## ID ## _m) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
|
||||||
if (device->mul_mat ## ID ## _s) \
|
if (device->mul_mat ## ID ## _s) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
|
||||||
|
|
||||||
// Create 2 variants, {f16,f32} accumulator
|
// Create 2 variants, {f16,f32} accumulator
|
||||||
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
if (device->coopmat_acc_f16_support) { \
|
||||||
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
} \
|
||||||
|
if (device->coopmat_acc_f32_support) { \
|
||||||
|
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
} \
|
||||||
|
|
||||||
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
|
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
if (device->coopmat_acc_f16_support) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
} else {
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
}
|
||||||
|
|
||||||
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
||||||
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
||||||
|
@ -1653,19 +1698,35 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
|
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
|
||||||
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
|
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
|
||||||
|
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
if (device->coopmat_acc_f16_support) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
} else {
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
#undef CREATE_MM2
|
||||||
#undef CREATE_MM
|
#undef CREATE_MM
|
||||||
} else if (device->fp16) {
|
} else if (device->fp16) {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
|
@ -1683,6 +1744,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
if (device->mul_mat ## ID ## _s) \
|
if (device->mul_mat ## ID ## _s) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
|
||||||
|
|
||||||
|
// Create 2 variants, {f16,f32} accumulator
|
||||||
|
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
|
||||||
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
|
@ -1720,6 +1786,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
}
|
}
|
||||||
|
#undef CREATE_MM2
|
||||||
#undef CREATE_MM
|
#undef CREATE_MM
|
||||||
} else {
|
} else {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
|
@ -1774,7 +1841,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
|
||||||
}
|
}
|
||||||
#undef CREATE_MM2
|
|
||||||
#undef CREATE_MM
|
#undef CREATE_MM
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1998,6 +2064,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
amd_shader_core_properties2 = true;
|
amd_shader_core_properties2 = true;
|
||||||
} else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
|
} else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
|
||||||
pipeline_robustness = true;
|
pipeline_robustness = true;
|
||||||
|
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
||||||
|
device->subgroup_size_control = true;
|
||||||
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
|
||||||
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
!getenv("GGML_VK_DISABLE_COOPMAT")) {
|
||||||
device->coopmat_support = true;
|
device->coopmat_support = true;
|
||||||
|
@ -2018,6 +2086,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
|
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
|
||||||
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
|
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
|
||||||
vk::PhysicalDeviceVulkan12Properties vk12_props;
|
vk::PhysicalDeviceVulkan12Properties vk12_props;
|
||||||
|
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
||||||
|
|
||||||
props2.pNext = &props3;
|
props2.pNext = &props3;
|
||||||
props3.pNext = &subgroup_props;
|
props3.pNext = &subgroup_props;
|
||||||
subgroup_props.pNext = &driver_props;
|
subgroup_props.pNext = &driver_props;
|
||||||
|
@ -2037,6 +2107,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
|
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
|
||||||
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
|
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
|
||||||
}
|
}
|
||||||
|
if (device->subgroup_size_control) {
|
||||||
|
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
|
||||||
|
last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(VK_NV_cooperative_matrix2)
|
#if defined(VK_NV_cooperative_matrix2)
|
||||||
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
|
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
|
||||||
|
@ -2075,7 +2149,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
|
|
||||||
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
||||||
|
|
||||||
if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
|
if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
|
||||||
// Intel drivers don't support coopmat properly yet
|
// Intel drivers don't support coopmat properly yet
|
||||||
// Only RADV supports coopmat properly on AMD
|
// Only RADV supports coopmat properly on AMD
|
||||||
device->coopmat_support = false;
|
device->coopmat_support = false;
|
||||||
|
@ -2131,6 +2205,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
device_extensions.push_back("VK_EXT_pipeline_robustness");
|
device_extensions.push_back("VK_EXT_pipeline_robustness");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
|
||||||
|
subgroup_size_control_features.pNext = nullptr;
|
||||||
|
subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
|
||||||
|
subgroup_size_control_features.computeFullSubgroups = false;
|
||||||
|
subgroup_size_control_features.subgroupSizeControl = false;
|
||||||
|
|
||||||
|
if (device->subgroup_size_control) {
|
||||||
|
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
|
||||||
|
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
|
||||||
|
}
|
||||||
|
|
||||||
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
|
||||||
coopmat_features.pNext = nullptr;
|
coopmat_features.pNext = nullptr;
|
||||||
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
|
||||||
|
@ -2158,6 +2243,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
|
|
||||||
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
|
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
|
||||||
|
|
||||||
|
device->subgroup_size_control = device->subgroup_size_control &&
|
||||||
|
(subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||||
|
subgroup_size_control_features.subgroupSizeControl;
|
||||||
|
|
||||||
|
if (device->subgroup_size_control) {
|
||||||
|
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
|
||||||
|
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
|
||||||
|
device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
|
||||||
|
device_extensions.push_back("VK_EXT_subgroup_size_control");
|
||||||
|
}
|
||||||
|
|
||||||
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
|
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
|
||||||
|
|
||||||
if (coopmat2_support) {
|
if (coopmat2_support) {
|
||||||
|
@ -2307,7 +2403,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (device->coopmat_m == 0) {
|
if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) {
|
||||||
// No suitable matmul mode found
|
// No suitable matmul mode found
|
||||||
GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
|
GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
|
||||||
device->coopmat_support = false;
|
device->coopmat_support = false;
|
||||||
|
@ -2440,7 +2536,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
|
if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
|
||||||
// Intel drivers don't support coopmat properly yet
|
// Intel drivers don't support coopmat properly yet
|
||||||
// Only RADV supports coopmat properly on AMD
|
// Only RADV supports coopmat properly on AMD
|
||||||
coopmat_support = false;
|
coopmat_support = false;
|
||||||
|
@ -2727,7 +2823,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
||||||
return ctx->device->pipeline_matmul_f32_f16;
|
return ctx->device->pipeline_matmul_f32_f16;
|
||||||
}
|
}
|
||||||
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
|
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
|
||||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_matmul_f16_f32.f16acc;
|
return ctx->device->pipeline_matmul_f16_f32.f16acc;
|
||||||
}
|
}
|
||||||
|
@ -2802,7 +2898,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_matmul_id_f32;
|
return ctx->device->pipeline_matmul_id_f32;
|
||||||
}
|
}
|
||||||
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
|
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
|
||||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
|
return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||||
const uint i = gl_WorkGroupID.x * 256 + wgy;
|
const uint ib = gl_WorkGroupID.x * 256 + wgy;
|
||||||
if (i >= p.M * p.K / QUANT_K) {
|
if (ib >= p.M * p.K / QUANT_K) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,37 +20,49 @@ void main() {
|
||||||
const uint is = 2 * il;
|
const uint is = 2 * il;
|
||||||
const uint n = 4;
|
const uint n = 4;
|
||||||
|
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
|
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
|
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
|
||||||
|
|
||||||
const uint y_idx = i * QUANT_K + 64 * il + n * ir;
|
const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
|
||||||
const uint qs_idx = 32*il + n * ir;
|
const uint qs_idx = 32*il + n * ir;
|
||||||
|
|
||||||
uint8_t sc;
|
uint scidx0 = (is < 4) ? is : (is + 4);
|
||||||
uint8_t m;
|
uint scidx1 = (is < 4) ? is : (is - 4);
|
||||||
if (is < 4) {
|
uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
sc = uint8_t(data_a[i].scales[is] & 63);
|
uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||||
m = uint8_t(data_a[i].scales[is + 4] & 63);
|
uint mbidx0 = is + 4;
|
||||||
} else {
|
uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||||
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
|
uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||||
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
|
uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||||
}
|
uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
const FLOAT_TYPE d1 = dall * sc;
|
uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||||
const FLOAT_TYPE m1 = dmin * m;
|
|
||||||
|
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||||
|
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||||
|
|
||||||
|
const FLOAT_TYPE d1 = dall * sc;
|
||||||
|
const FLOAT_TYPE m1 = dmin * mbyte;
|
||||||
|
|
||||||
|
scidx0 = (is < 4) ? is + 1 : (is + 5);
|
||||||
|
scidx1 = (is < 4) ? is + 1 : (is - 3);
|
||||||
|
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
|
scidxshift1 = (is < 4) ? 0 : 2;
|
||||||
|
mbidx0 = is + 5;
|
||||||
|
mbidx1 = (is < 4) ? is + 5 : is + 1;
|
||||||
|
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||||
|
mbidxshift0 = (is < 4) ? 0 : 4;
|
||||||
|
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
|
mbidxshift1 = (is < 4) ? 0 : 2;
|
||||||
|
|
||||||
|
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||||
|
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||||
|
|
||||||
if (is < 4) {
|
|
||||||
sc = uint8_t(data_a[i].scales[is + 1] & 63);
|
|
||||||
m = uint8_t(data_a[i].scales[is + 5] & 63);
|
|
||||||
} else {
|
|
||||||
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
|
|
||||||
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
|
|
||||||
}
|
|
||||||
const FLOAT_TYPE d2 = dall * sc;
|
const FLOAT_TYPE d2 = dall * sc;
|
||||||
const FLOAT_TYPE m2 = dmin * m;
|
const FLOAT_TYPE m2 = dmin * mbyte;
|
||||||
|
|
||||||
[[unroll]] for (uint l = 0; l < n; ++l) {
|
[[unroll]] for (uint l = 0; l < n; ++l) {
|
||||||
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
|
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
|
||||||
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
|
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >> 4) - m2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||||
const uint i = gl_WorkGroupID.x * 256 + wgy;
|
const uint ib = gl_WorkGroupID.x * 256 + wgy;
|
||||||
if (i >= p.M * p.K / QUANT_K) {
|
if (ib >= p.M * p.K / QUANT_K) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,40 +19,52 @@ void main() {
|
||||||
const uint ir = tid % 16;
|
const uint ir = tid % 16;
|
||||||
const uint is = 2 * il;
|
const uint is = 2 * il;
|
||||||
|
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
|
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
|
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
|
||||||
|
|
||||||
const uint y_idx = i * QUANT_K + 64 * il + 2 * ir;
|
const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
|
||||||
const uint qs_idx = 32*il + 2 * ir;
|
const uint qs_idx = 32*il + 2 * ir;
|
||||||
const uint qh_idx = 2 * ir;
|
const uint qh_idx = 2 * ir;
|
||||||
|
|
||||||
uint8_t sc;
|
uint scidx0 = (is < 4) ? is : (is + 4);
|
||||||
uint8_t m;
|
uint scidx1 = (is < 4) ? is : (is - 4);
|
||||||
if (is < 4) {
|
uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
sc = uint8_t(data_a[i].scales[is] & 63);
|
uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||||
m = uint8_t(data_a[i].scales[is + 4] & 63);
|
uint mbidx0 = is + 4;
|
||||||
} else {
|
uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||||
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
|
uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||||
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
|
uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||||
}
|
uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
const FLOAT_TYPE d1 = dall * sc;
|
uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||||
const FLOAT_TYPE m1 = dmin * m;
|
|
||||||
|
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||||
|
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||||
|
|
||||||
|
const FLOAT_TYPE d1 = dall * sc;
|
||||||
|
const FLOAT_TYPE m1 = dmin * mbyte;
|
||||||
|
|
||||||
|
scidx0 = (is < 4) ? is + 1 : (is + 5);
|
||||||
|
scidx1 = (is < 4) ? is + 1 : (is - 3);
|
||||||
|
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
|
scidxshift1 = (is < 4) ? 0 : 2;
|
||||||
|
mbidx0 = is + 5;
|
||||||
|
mbidx1 = (is < 4) ? is + 5 : is + 1;
|
||||||
|
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||||
|
mbidxshift0 = (is < 4) ? 0 : 4;
|
||||||
|
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||||
|
mbidxshift1 = (is < 4) ? 0 : 2;
|
||||||
|
|
||||||
|
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||||
|
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||||
|
|
||||||
if (is < 4) {
|
|
||||||
sc = uint8_t(data_a[i].scales[is + 1] & 63);
|
|
||||||
m = uint8_t(data_a[i].scales[is + 5] & 63);
|
|
||||||
} else {
|
|
||||||
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
|
|
||||||
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
|
|
||||||
}
|
|
||||||
const FLOAT_TYPE d2 = dall * sc;
|
const FLOAT_TYPE d2 = dall * sc;
|
||||||
const FLOAT_TYPE m2 = dmin * m;
|
const FLOAT_TYPE m2 = dmin * mbyte;
|
||||||
|
|
||||||
const uint8_t hm1 = uint8_t(1 << (2 * il ));
|
const uint8_t hm1 = uint8_t(1 << (2 * il ));
|
||||||
const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
|
const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
|
||||||
data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx ] & 0xF) + (((data_a[i].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
|
data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] & 0xF) + (((data_a[ib].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
|
||||||
data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
|
data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
|
||||||
data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx ] >> 4) + (((data_a[i].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
|
data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] >> 4) + (((data_a[ib].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
|
||||||
data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
|
data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "gguf"
|
name = "gguf"
|
||||||
version = "0.10.0"
|
version = "0.11.0"
|
||||||
description = "Read and write ML models in GGUF for GGML"
|
description = "Read and write ML models in GGUF for GGML"
|
||||||
authors = ["GGML <ggml@ggml.ai>"]
|
authors = ["GGML <ggml@ggml.ai>"]
|
||||||
packages = [
|
packages = [
|
||||||
|
|
|
@ -456,6 +456,7 @@ extern "C" {
|
||||||
// Functions to access the model's GGUF metadata scalar values
|
// Functions to access the model's GGUF metadata scalar values
|
||||||
// - The functions return the length of the string on success, or -1 on failure
|
// - The functions return the length of the string on success, or -1 on failure
|
||||||
// - The output string is always null-terminated and cleared on failure
|
// - The output string is always null-terminated and cleared on failure
|
||||||
|
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
|
||||||
// - GGUF array values are not supported by these functions
|
// - GGUF array values are not supported by these functions
|
||||||
|
|
||||||
// Get metadata value as a string by key name
|
// Get metadata value as a string by key name
|
||||||
|
|
|
@ -1808,7 +1808,7 @@ private:
|
||||||
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||||
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
|
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
|
||||||
if (!bufLen) {
|
if (!bufLen) {
|
||||||
ret = format("Win32 error code: %s", error_code);
|
ret = format("Win32 error code: %lx", error_code);
|
||||||
} else {
|
} else {
|
||||||
ret = lpMsgBuf;
|
ret = lpMsgBuf;
|
||||||
LocalFree(lpMsgBuf);
|
LocalFree(lpMsgBuf);
|
||||||
|
@ -2147,7 +2147,7 @@ struct llama_mmap {
|
||||||
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
||||||
|
|
||||||
// may fail on pre-Windows 8 systems
|
// may fail on pre-Windows 8 systems
|
||||||
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
|
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
|
||||||
|
|
||||||
if (pPrefetchVirtualMemory) {
|
if (pPrefetchVirtualMemory) {
|
||||||
// advise the kernel to preload the mapped memory
|
// advise the kernel to preload the mapped memory
|
||||||
|
@ -21755,7 +21755,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
||||||
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
|
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
|
||||||
}
|
}
|
||||||
} else if ((size_t) i >= ctx->output_ids.size()) {
|
} else if ((size_t) i >= ctx->output_ids.size()) {
|
||||||
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
|
||||||
} else {
|
} else {
|
||||||
j = ctx->output_ids[i];
|
j = ctx->output_ids[i];
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue