Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 17:14:36 +00:00

Commit f0a662112b: Merge branch 'master' into concedo_experimental

Conflicts: .devops/nix/package.nix, .github/workflows/build.yml, .gitignore, CMakeLists.txt, Makefile, README.md, ci/run.sh, flake.lock, flake.nix, scripts/get-flags.mk, scripts/get-wikitext-2.sh, scripts/sync-ggml.last, tests/CMakeLists.txt, tests/test-backend-ops.cpp, tests/test-grammar-parser.cpp, tests/test-llama-grammar.cpp

34 changed files with 2394 additions and 753 deletions
@@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact
 
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
-Recommend to install to default folder: **/opt/intel/oneapi**.
+Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
 
 Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
 
@@ -1705,6 +1705,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1742,7 +1743,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 
     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@@ -1751,7 +1752,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
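Several hunks in this commit only swap printf-style conversion specifiers from `%d` to `%u` where the argument is unsigned (`params.seed`, `std::thread::hardware_concurrency()`, and the `n_*` hyperparameters further below). A minimal stand-alone sketch of why the specifier must match the argument type:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t seed = 0xFFFFFFFFu; // "-1" stored in an unsigned field
    // Mismatched: %d expects int, so the bit pattern is reinterpreted and
    // prints -1 (compilers flag this under -Wformat).
    printf("seed via %%d: %d\n", (int) seed); // cast shown to keep the demo well-defined
    // Matched: %u prints the actual stored value 4294967295.
    printf("seed via %%u: %u\n", seed);
    return 0;
}
```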
@@ -1802,7 +1803,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
+                seqs[cs_curr[j]] = sz;
             }
         }
         if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
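The change above stops reading `seqs.size()` and inserting into `seqs` in the same statement: `std::map::operator[]` may insert a new element, and before C++17 the order of that insertion relative to the `size()` call on the right-hand side was unspecified. A stand-alone illustration of the fixed pattern:

```cpp
#include <cstdio>
#include <map>

int main() {
    std::map<int, size_t> seqs;
    int key = 7;
    // Problematic form: operator[] may insert, and pre-C++17 it is
    // unspecified whether size() is evaluated before or after that
    // insertion, so the stored id could be off by one:
    //     seqs[key] = seqs.size();

    // Fixed form, mirroring the patch: read the size first, then insert.
    const size_t sz = seqs.size();
    seqs[key] = sz;
    printf("id assigned to key %d: %zu\n", key, seqs[key]); // prints 0
    return 0;
}
```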
@@ -121,7 +121,7 @@ static void sampler_queue(
     struct llama_context * ctx_main,
     const llama_sampling_params & params,
     llama_token_data_array & cur_p,
-    size_t & min_keep) {
+    size_t min_keep) {
     const float temp = params.temp;
     const float dynatemp_range = params.dynatemp_range;
     const float dynatemp_exponent = params.dynatemp_exponent;
@@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
             id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
         } else {
             // temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
+            size_t min_keep = std::max(1, params.min_keep);
 
             sampler_queue(ctx_main, params, cur_p, min_keep);
 
@@ -22,6 +22,7 @@ enum class llama_sampler_type : char {
 typedef struct llama_sampling_params {
     int32_t n_prev = 64;       // number of previous tokens to remember
     int32_t n_probs = 0;       // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0;      // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t top_k = 40;        // <= 0 to use vocab size
     float   top_p = 0.95f;     // 1.0 = disabled
     float   min_p = 0.05f;     // 0.0 = disabled
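The new `min_keep` field gives callers a floor on how many candidates the truncating samplers (top-k, top-p, min-p, ...) may cut the list down to, decoupled from `n_probs`. A hedged sketch of the contract, using a hypothetical `truncate_candidates` helper rather than the real sampler code:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical helper (not part of llama.cpp): truncate a sorted candidate
// list while honoring the min_keep floor the new parameter expresses.
static void truncate_candidates(std::vector<float> & probs, size_t top_k, size_t min_keep) {
    size_t keep = std::max(top_k, std::max<size_t>(min_keep, 1));
    keep = std::min(keep, probs.size());
    probs.resize(keep);
}

int main() {
    std::vector<float> probs = {0.5f, 0.2f, 0.15f, 0.1f, 0.05f}; // sorted desc
    truncate_candidates(probs, /*top_k=*/1, /*min_keep=*/3);
    printf("kept %zu candidates\n", probs.size()); // 3, not 1
    return 0;
}
```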
@@ -1533,16 +1533,17 @@ int main(int argc, char ** argv) {
 
             int n_past = 0;
 
-            ggml_cgraph gf = {};
+            struct ggml_cgraph * gf = NULL;
+            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
 
             get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
 
-            struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+            struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
             // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
             struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
-            ggml_build_forward_expand(&gf, e);
-            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(gf, e);
+            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
 
             float error_before_opt = ggml_get_f32_1d(e, 0);
 
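This group of hunks migrates from a stack-allocated `ggml_cgraph gf = {};` to a heap graph obtained from `ggml_new_graph_custom(ctx, size, grads)`, so graph functions now take the pointer directly instead of `&gf`. A minimal sketch of the new pattern, assuming the ggml API of this commit:

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // graphs now live inside the ggml context; the second argument is the
    // node capacity and the third requests gradient bookkeeping
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 1024, true);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_sqr(ctx0, a);

    ggml_build_forward_expand(gf, b); // pass the pointer, no & needed

    ggml_free(ctx0);
    return 0;
}
```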
@@ -1552,8 +1553,8 @@ int main(int argc, char ** argv) {
             opt_params_lbfgs.lbfgs.n_iter = 16;
             ggml_opt(ctx0, opt_params_lbfgs, e);
             //
-            ggml_build_forward_expand(&gf, e);
-            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(gf, e);
+            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
 
             float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
         };
         struct ggml_context * ctx0 = ggml_init(params);
 
-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
+        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
 
         int n_past = 0;
-        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
 
-        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, logits);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
 
     const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
@@ -325,14 +325,14 @@ struct train_params {
 };
 
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
 
 static void init_model(struct my_llama_model * model) {
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
     model->train_tokens = 0;
 
     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
 
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
 
     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
 
     // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
 
-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
 
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
 
     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
@@ -7,8 +7,6 @@
 #include <string>
 #include <thread>
 
-static const size_t tensor_alignment = 32;
-
 struct lora_info {
     std::string filename;
     float scale;
@@ -87,7 +87,21 @@ class SchemaConverter:
         elif schema_type == 'array' and 'items' in schema:
             # TODO `prefixItems` keyword
             item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            list_item_operator = f'("," space {item_rule_name})'
+            successive_items = ""
+            min_items = schema.get("minItems", 0)
+            if min_items > 0:
+                first_item = f"({item_rule_name})"
+                successive_items = list_item_operator * (min_items - 1)
+                min_items -= 1
+            else:
+                first_item = f"({item_rule_name})?"
+            max_items = schema.get("maxItems")
+            if max_items is not None and max_items > min_items:
+                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
+            else:
+                successive_items += list_item_operator + "*"
+            rule = f'"[" space {first_item} {successive_items} "]" space'
             return self._add_rule(rule_name, rule)

         else:
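The new branch encodes JSON-schema `minItems`/`maxItems` as grammar repetition: the first `min` items are emitted as mandatory groups, the remaining `max - min` as optional groups, or a trailing `*` when `maxItems` is absent. A hedged C++ transcription of the same string-building logic (`array_rule` is illustrative, not part of the repo):

```cpp
#include <cstdio>
#include <string>

// Build the GBNF repetition for an array rule given minItems/maxItems
// (max_items < 0 means unbounded), mirroring the Python above.
static std::string array_rule(const std::string & item, int min_items, int max_items) {
    const std::string op = "(\",\" space " + item + ")";
    std::string first, rest;
    if (min_items > 0) {
        first = "(" + item + ")";
        for (int i = 0; i < min_items - 1; i++) rest += op; // mandatory repeats
        min_items -= 1;
    } else {
        first = "(" + item + ")?";
    }
    if (max_items >= 0 && max_items > min_items) {
        for (int i = 0; i < max_items - min_items - 1; i++) rest += op + "?"; // optional repeats
    } else {
        rest += op + "*";
    }
    return "\"[\" space " + first + " " + rest + " \"]\" space";
}

int main() {
    // minItems=2, maxItems=3: one mandatory comma-item, one optional
    printf("%s\n", array_rule("item", 2, 3).c_str());
    return 0;
}
```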
@@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh
-python ./convert.py ../llava-v1.5-7b
+python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```
 
 Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
@@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         KQ = ggml_soft_max_inplace(ctx0, KQ);
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
         KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-        KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
+        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
-        cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
+        cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
     }
 
     // attention output
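The pair of changes above fuses permute, cont, and copy-into-a-new-tensor into a single `ggml_cont_3d`, which materializes the permuted view directly as a contiguous 3-d tensor. A minimal sketch, assuming this commit's ggml API:

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { 64 * 1024 * 1024, NULL, false };
    struct ggml_context * ctx0 = ggml_init(ip);

    const int d_head = 8, n_pos = 16, n_head = 4, batch = 1;
    struct ggml_tensor * KQV = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, d_head, n_pos, n_head, batch);

    // permute produces a non-contiguous view ...
    struct ggml_tensor * perm = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
    // ... and cont_3d materializes it as a contiguous [d_head*n_head, n_pos, batch] tensor
    struct ggml_tensor * cur = ggml_cont_3d(ctx0, perm, d_head * n_head, n_pos, batch);
    (void) cur;

    ggml_free(ctx0);
    return 0;
}
```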
@@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
 projector = {name: checkpoint[name].float() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")
 
-# remove these tensors from the checkpoint and save it again
-for name in mm_tensors:
-    del checkpoint[name]
-
 # BakLLaVA models contain CLIP tensors in it
 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
 if len(clip_tensors) > 0:
     clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
     torch.save(clip, f"{args.model}/llava.clip")
 
-    # remove these tensors
-    for name in clip_tensors:
-        del checkpoint[name]
 
 # added tokens should be removed to be able to convert Mistral models
 if os.path.exists(f"{args.model}/added_tokens.json"):
@@ -39,7 +32,6 @@ if len(clip_tensors) > 0:
         f.write("{}\n")
 
 
-torch.save(checkpoint, path)
 
 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
@@ -310,7 +310,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
 }
 
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
@@ -448,7 +448,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         return perplexity_v2(ctx, params);
     }
 
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
@@ -1624,7 +1624,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     uint32_t n_ctx;
     in.read((char *)&n_ctx, sizeof(n_ctx));
     if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
@@ -24,6 +24,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
     { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
+    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
@@ -288,9 +289,10 @@ int main(int argc, char ** argv) {
         }
     }
 
-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
         fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "===============================================================================================\n\n\n");
         return 1;
     }
@@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 
 ## Build
 
@@ -132,9 +134,11 @@ node index.js
 ## API Endpoints
 
 - **GET** `/health`: Returns the current state of the server:
-  - `{"status": "loading model"}` if the model is still being loaded.
-  - `{"status": "error"}` if the model failed to load.
-  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
+  - 500 -> `{"status": "error"}` if the model failed to load.
+  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
 
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
 
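A minimal readiness probe against `/health`, sketched with cpp-httplib (the header-only HTTP library the server itself embeds); the host and port here are assumptions:

```cpp
#include <cstdio>
#include "httplib.h"

// Poll /health; with fail_on_no_slot set, a fully busy server answers 503
// instead of 200, which is convenient for load balancers.
int main() {
    httplib::Client cli("localhost", 8080);
    auto res = cli.Get("/health?fail_on_no_slot=1");
    if (!res) {
        printf("server unreachable\n");
        return 1;
    }
    printf("status=%d body=%s\n", res->status, res->body.c_str());
    return res->status == 200 ? 0 : 1;
}
```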
@@ -196,6 +200,8 @@ node index.js
 
     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
 
+    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
+
     `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
     `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
@@ -379,6 +385,69 @@ Notice that each `probs` is an array of length `n_probs`.
 }'
 ```
 
+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+    {
+        "dynatemp_exponent": 1.0,
+        "dynatemp_range": 0.0,
+        "frequency_penalty": 0.0,
+        "grammar": "",
+        "id": 0,
+        "ignore_eos": false,
+        "logit_bias": [],
+        "min_p": 0.05000000074505806,
+        "mirostat": 0,
+        "mirostat_eta": 0.10000000149011612,
+        "mirostat_tau": 5.0,
+        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+        "n_ctx": 2048,
+        "n_keep": 0,
+        "n_predict": 100000,
+        "n_probs": 0,
+        "next_token": {
+            "has_next_token": true,
+            "n_remain": -1,
+            "num_tokens_predicted": 0,
+            "stopped_eos": false,
+            "stopped_limit": false,
+            "stopped_word": false,
+            "stopping_word": ""
+        },
+        "penalize_nl": true,
+        "penalty_prompt_tokens": [],
+        "presence_penalty": 0.0,
+        "prompt": "Say hello to llama.cpp",
+        "repeat_last_n": 64,
+        "repeat_penalty": 1.100000023841858,
+        "samplers": [
+            "top_k",
+            "tfs_z",
+            "typical_p",
+            "top_p",
+            "min_p",
+            "temperature"
+        ],
+        "seed": 42,
+        "state": 1,
+        "stop": [
+            "\n"
+        ],
+        "stream": false,
+        "task_id": 0,
+        "temperature": 0.0,
+        "tfs_z": 1.0,
+        "top_k": 40,
+        "top_p": 0.949999988079071,
+        "typical_p": 1.0,
+        "use_penalty_prompt_tokens": false
+    }
+]
+```
 
 ## More examples
 
 ### Change system prompt on runtime
@@ -234,6 +234,7 @@
       mirostat_eta: 0.1, // learning rate
       grammar: '',
       n_probs: 0, // no completion_probabilities,
+      min_keep: 0, // min probs from each sampler,
       image_data: [],
       cache_prompt: true,
       api_key: ''
@@ -791,6 +792,9 @@
             <fieldset>
               ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
             </fieldset>
+            <fieldset>
+              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+            </fieldset>
             <fieldset>
               <label for="api_key">API Key</label>
               <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
@@ -29,6 +29,7 @@
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
+#include <signal.h>
 
 using json = nlohmann::json;
 
|
||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
bool slots_endpoint = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool server_verbose = false;
|
bool server_verbose = false;
|
||||||
|
@ -159,6 +161,7 @@ struct llama_client_slot
|
||||||
int32_t n_decoded = 0;
|
int32_t n_decoded = 0;
|
||||||
int32_t n_remaining = -1;
|
int32_t n_remaining = -1;
|
||||||
int32_t i_batch = -1;
|
int32_t i_batch = -1;
|
||||||
|
int32_t n_predict = -1;
|
||||||
|
|
||||||
int32_t num_prompt_tokens = 0;
|
int32_t num_prompt_tokens = 0;
|
||||||
int32_t num_prompt_tokens_processed = 0;
|
int32_t num_prompt_tokens_processed = 0;
|
||||||
|
@@ -410,6 +413,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -545,6 +549,16 @@ struct llama_server_context
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
 
         // infill
         if (data.count("input_prefix") != 0)
|
@ -1053,6 +1067,7 @@ struct llama_server_context
|
||||||
|
|
||||||
return json {
|
return json {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
|
{"n_predict", slot.n_predict},
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"seed", slot.params.seed},
|
{"seed", slot.params.seed},
|
||||||
{"temperature", slot.sparams.temp},
|
{"temperature", slot.sparams.temp},
|
||||||
|
@@ -1080,6 +1095,7 @@ struct llama_server_context
             {"stream", slot.params.stream},
             {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
+            {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
             {"samplers", samplers_sequence}
         };
@@ -1914,14 +1930,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
    printf("  --log-disable             disables logging to a file.\n");
+   printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("\n");
+   printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
    printf("  --chat-template FORMAT_NAME");
-   printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+   printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
    printf("\n");
 }
 
@@ -2361,6 +2379,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2512,6 +2534,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
|
||||||
res.set_header("Access-Control-Allow-Headers", "*");
|
res.set_header("Access-Control-Allow-Headers", "*");
|
||||||
});
|
});
|
||||||
|
|
||||||
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
|
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
|
||||||
server_state current_state = state.load();
|
server_state current_state = state.load();
|
||||||
switch(current_state) {
|
switch(current_state) {
|
||||||
case SERVER_STATE_READY:
|
case SERVER_STATE_READY: {
|
||||||
res.set_content(R"({"status": "ok"})", "application/json");
|
int available_slots = 0;
|
||||||
res.status = 200; // HTTP OK
|
int processing_slots = 0;
|
||||||
|
for (llama_client_slot &slot: llama.slots) {
|
||||||
|
if (slot.available()) {
|
||||||
|
available_slots++;
|
||||||
|
} else {
|
||||||
|
processing_slots++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (available_slots > 0) {
|
||||||
|
json health = {
|
||||||
|
{"status", "ok"},
|
||||||
|
{"slots_idle", available_slots},
|
||||||
|
{"slots_processing", processing_slots}};
|
||||||
|
res.set_content(health.dump(), "application/json");
|
||||||
|
res.status = 200; // HTTP OK
|
||||||
|
} else {
|
||||||
|
json health = {
|
||||||
|
{"status", "no slot available"},
|
||||||
|
{"slots_idle", available_slots},
|
||||||
|
{"slots_processing", processing_slots}};
|
||||||
|
res.set_content(health.dump(), "application/json");
|
||||||
|
if (req.has_param("fail_on_no_slot")) {
|
||||||
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
} else {
|
||||||
|
res.status = 200; // HTTP OK
|
||||||
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case SERVER_STATE_LOADING_MODEL:
|
case SERVER_STATE_LOADING_MODEL:
|
||||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||||
res.status = 503; // HTTP Service Unavailable
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
@@ -2576,6 +2628,32 @@ int main(int argc, char **argv)
         }
     });
 
+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token", slot.has_next_token},
+                    {"n_remain", slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos", slot.stopped_eos},
+                    {"stopped_word", slot.stopped_word},
+                    {"stopped_limit", slot.stopped_limit},
+                    {"stopping_word", slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);
 
     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
@@ -3129,8 +3207,25 @@ int main(int argc, char **argv)
         std::placeholders::_2,
         std::placeholders::_3
     ));
-    llama.queue_tasks.start_loop();
+
+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    llama.queue_tasks.start_loop();
+    svr.stop();
     t.join();
 
     llama_backend_free();
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
+    bool running;
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }
 
-    // Start the main loop. This call is blocking
-    [[noreturn]]
+    // end the start_loop routine
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            running = false;
+        }
+        condition_tasks.notify_all();
+    }
+
+    // Start the main loop.
     void start_loop() {
+        running = true;
         while (true) {
             // new task arrived
             LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
                     condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                    });
                 }
             }
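Together, `terminate()` and the reworked wait predicate implement a standard condition-variable shutdown: flip `running` under the same mutex the consumer waits on, notify, and let the wait wake on either new work or the stop flag. A stand-alone sketch of the pattern (names mirror the patch, but this is not the server code):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

struct task_queue {
    std::mutex mutex_tasks;
    std::condition_variable condition_tasks;
    std::deque<int> tasks;
    bool running = false;

    void terminate() {
        {
            std::lock_guard<std::mutex> lock(mutex_tasks);
            running = false; // flipped under the lock so the waiter cannot miss it
        }
        condition_tasks.notify_all();
    }

    void start_loop() {
        running = true;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            condition_tasks.wait(lock, [&] { return !tasks.empty() || !running; });
            if (tasks.empty()) { return; } // woken by terminate()
            int task = tasks.front();
            tasks.pop_front();
            lock.unlock();
            printf("processing task %d\n", task);
        }
    }
};

int main() {
    task_queue q;
    std::thread worker([&] { q.start_loop(); });
    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // let the loop start (demo only)
    q.terminate();
    worker.join();
    return 0;
}
```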
@@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP   = "blk.%d.ffn_up";
 
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
 
 static void set_param_model(struct my_llama_model * model) {
ggml-alloc.c (120 lines changed):
@@ -377,6 +377,9 @@ struct ggml_gallocr {
 
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }
 
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         for (int i = 0; i < GGML_MAX_SRC; i++) {
             struct ggml_tensor * parent = node->src[i];
             if (parent == NULL) {
-                break;
+                continue;
             }
 
             // if the node's data is external, then we cannot re-use it
|
||||||
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
||||||
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
||||||
|
|
||||||
// allocate all graph inputs first to avoid overwriting them
|
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
|
||||||
if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
||||||
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
|
||||||
}
|
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
||||||
if (graph->nodes[i]->src[j] == NULL) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
|
|
||||||
ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// count number of children and views
|
// count number of children and views
|
||||||
|
// allocate all graph inputs and leafs first to avoid overwriting them
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
|
|
||||||
|
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
-            }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
         }
-    }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
+        }
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }
 
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             ggml_gallocr_allocate_node(galloc, parent, buffer_id);
         }
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             AT_PRINTF("%s", parent->name);
             if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * parent = node->src[j];
             if (parent == NULL) {
-                break;
+                continue;
             }
             struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
             p_hn->n_children -= 1;
@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }

+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset   = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }
+
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);

     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }

+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
 #ifndef NDEBUG
@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }

     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }

     return true;
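[Note] For context, a minimal sketch of how the graph allocator changed above is typically driven; the function names are taken from ggml-alloc.h, but treat the snippet as an assumption rather than code from this commit. After this change, ggml_gallocr_alloc_graph also initializes the graph leafs from galloc->leaf_allocs:

    // reserve once for the worst-case graph, then allocate each graph before eval
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, worst_case_graph); // measures and grows the buffers
    ggml_gallocr_alloc_graph(galloc, graph);        // assigns addresses to all tensors
    // ... evaluate the graph ...
    ggml_gallocr_free(galloc);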
ggml-backend.c
@ -756,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@ -1006,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+    return -1; // silence warning
 }

 #if 0
@ -1040,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
-            break;
+            continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@ -1087,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@ -1143,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             if (tensor_backend_id(src) == -1) {
                 tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@ -1255,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * src = node->src[j];
                 if (src == NULL) {
-                    break;
+                    continue;
                 }
                 int src_backend_id = tensor_backend_id(src);
                 if (src_backend_id == -1) {
@ -1314,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * src = node->src[j];
                 if (src == NULL) {
-                    break;
+                    continue;
                 }
                 int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now
@ -1361,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
-                break;
+                continue;
             }
             ggml_backend_t src_backend = tensor_backend(src);
             if (src_backend != tensor_backend /* && src_backend != NULL */) {
@ -1667,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
     }
@ -1696,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         struct ggml_tensor * s = src->src[i];
         if (s == NULL) {
-            break;
+            continue;
         }
         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
     }

ggml-cuda.cu (559 changed lines)
@ -54,6 +54,8 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
@ -517,6 +519,15 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

+#define QR1_S 8
+#define QI1_S (QK_K / (4*QR1_S))
+typedef struct {
+    half d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
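[Note] The static_assert above pins the layout; with the usual QK_K == 256 the arithmetic works out to 1.5625 bits per weight (a worked check, not part of the diff):

    // half    d               ->  2 bytes (per-superblock scale)
    // uint8_t qs[QK_K/8]      -> 32 bytes (one 8-bit grid index per 8 weights)
    // uint8_t scales[QK_K/16] -> 16 bytes (two 4-bit fields per byte)
    // total: 50 bytes for 256 weights = 50*8/256 = 1.5625 bpw
    static_assert(2 + 256/8 + 256/16 == 50, "iq1_s block is 50 bytes at QK_K == 256");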
@ -643,18 +654,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }

-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    (void) a;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+//    }
+//    return a;
+//#else
+//    (void) a;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//}

 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@ -664,18 +675,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }

-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    (void) x;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+//    }
+//    return x;
+//#else
+//    (void) x;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//}

 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
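[Note] The half2 variants above are commented out rather than deleted; presumably because the f16 soft-max path removed later in this diff was their only caller, and leaving them defined would trigger unused-function warnings. The surviving float versions use the same XOR-shuffle butterfly: each step exchanges partial results with the lane whose id differs in one bit, so after masks 16, 8, 4, 2, 1 every lane holds the full warp total. A standalone sketch (assumes a full 32-lane warp):

    __device__ float warp_sum_sketch(float v) {
        for (int mask = 16; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask, 32);
        }
        return v; // identical in all 32 lanes
    }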
@ -1682,6 +1693,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };

+static const __device__ uint64_t iq1s_grid[512] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+};
+
 static const __device__ uint8_t ksigns_iq2xs[128] = {
     0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
     144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@ -1824,6 +1966,29 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 }

+template<typename dst_t>
+static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq1_s * x = (const block_iq1_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const int i8 = 4*ib+il;
+    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
+    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
+    const float d = (float)x[i].d * (2*(h & 7) + 1);
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
+#else
+    assert(false);
+#endif
+
+}
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
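[Note] The packing decoded by dequantize_block_iq1_s is dense: each 4-bit field h carries a 3-bit scale in bits 0..2 and the 9th bit of the grid index in bit 3, and the scale is applied as d * (2*s + 1), i.e. odd multipliers 1, 3, ..., 15. A worked decode of one 8-weight group with illustrative values qs[i8] = 0x7f, h = 0xA:

    // grid index = 0x7f | ((0xA & 8) << 5) = 0x7f | 0x100 = 0x17f   (9 bits, < 512)
    // scale      = 2*(0xA & 7) + 1 = 2*2 + 1 = 5
    // y[j]       = (float) d * 5 * grid[j]   for j = 0..7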
@ -4479,10 +4644,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
@ -4523,6 +4690,49 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #endif
 }

+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+    const int ib32 = iqs;
+    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+    const uint8_t h1 = bq1->scales[2*ib32+0];
+    const uint8_t h2 = bq1->scales[2*ib32+1];
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 2; ++j) {
+        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
+        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
+        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
+        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+    }
+#else
+    const int8_t * q8 = bq8_1[ib32].qs;
+    const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 8; ++j) {
+        sumi1 += q8[j+ 0] * grid1[j];
+        sumi2 += q8[j+ 8] * grid2[j];
+        sumi3 += q8[j+16] * grid3[j];
+        sumi4 += q8[j+24] * grid4[j];
+    }
+#endif
+    const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
+    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
+                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
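[Note] vec_dot_iq1_s_q8_1 leans on __dp4a on the fast path; it computes a dot product of four packed signed bytes with 32-bit accumulation, equivalent to this scalar sketch:

    // what sumi = __dp4a(a, b, sumi) computes, a and b each holding 4 int8 values
    __device__ int dp4a_sketch(int a, int b, int sumi) {
        const int8_t * va = (const int8_t *) &a;
        const int8_t * vb = (const int8_t *) &b;
        for (int k = 0; k < 4; ++k) {
            sumi += va[k] * vb[k];
        }
        return sumi;
    }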
@ -5957,149 +6167,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

-template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
-static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-    const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
-    const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
-    extern __shared__ half data_soft_max_f16[];
-    half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
-    // (shared memory) buffer to cache values between iterations:
-    half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
-    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
-    // in that case col_smem == col_data must be enforced to avoid race conditions
-
-    half2 max_val = make_half2(-INFINITY, -INFINITY);
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int ix = rowx*ncols_data + col_data;
-        const int iy = rowy*ncols_data + col_data;
-
-        half2 val;
-        if (need_check && col_data + 0 >= ncols_data) {
-            val.x = -INFINITY;
-        } else {
-            val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
-        }
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            val.y = -INFINITY;
-        } else {
-            val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
-        }
-        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
-            vals[col_smem] = val;
-        }
-        max_val = __hmax2(max_val, val);
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
-        }
-        __syncthreads();
-
-        max_val = __half2half2(buf_iw[lane_id]);
-        max_val = warp_reduce_max(max_val);
-    } else {
-        max_val = __half2half2(__hmax(max_val.x, max_val.y));
-    }
-
-    half2 tmp = make_half2(0.0f, 0.0f); // partial sums
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
-
-        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
-            break;
-        }
-
-        const half2 val = h2exp(vals[col_smem] - max_val);
-
-        tmp += val;
-        vals[col_smem] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp.x + tmp.y;
-        }
-        __syncthreads();
-
-        tmp = __half2half2(buf_iw[lane_id]);
-        tmp = warp_reduce_sum(tmp);
-    } else {
-        tmp = __half2half2(tmp.x + tmp.y);
-    }
-
-    const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int idst = rowx*ncols_data + col_data;
-        const half2 result = vals[col_smem] * inv_sum;
-
-        if (need_check && col_data + 0 >= ncols_data) {
-            return;
-        }
-        dst[idst] = result.x;
-
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            return;
-        }
-
-        dst[idst + WARP_SIZE] = result.y;
-    }
-#else
-    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
-
 template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
+static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

     const int tid  = threadIdx.x;
     const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

     const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

     const int warp_id = threadIdx.x / WARP_SIZE;
     const int lane_id = threadIdx.x % WARP_SIZE;

+    float slope = 0.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int h = rowx/nrows_y; // head index
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = powf(base, exp);
+    }
+
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
     // shared memory buffer to cache values between iterations:
@ -6118,7 +6210,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;

-        const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
+        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
         vals[col] = val;
         max_val = max(max_val, val);
     }
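[Note] The base/exp selection added to soft_max_f32 above reproduces the standard ALiBi slopes. A worked example under the assumption max_bias = 8.0f with 8 heads: n_head_log2 = 8, m0 = 2^(-8/8) = 0.5, m1 = 2^(-4/8) ~= 0.707; every head satisfies h < n_head_log2, so slope(h) = 0.5^(h+1), i.e. 1/2, 1/4, ..., 1/256 -- the classic geometric sequence. The m1 branch only fires when the head count is not a power of two:

    // sketch: the slope each head ends up with (same math as the kernel)
    float alibi_slope(int h, uint32_t n_head_log2, float m0, float m1) {
        return h < (int) n_head_log2 ? powf(m0, h + 1)
                                     : powf(m1, 2*(h - (int) n_head_log2) + 1);
    }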
@ -6679,6 +6772,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||||
return dequantize_row_iq2_xs_cuda;
|
return dequantize_row_iq2_xs_cuda;
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
return dequantize_row_iq3_xxs_cuda;
|
return dequantize_row_iq3_xxs_cuda;
|
||||||
|
case GGML_TYPE_IQ1_S:
|
||||||
|
return dequantize_row_iq1_s_cuda;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
return convert_unary_cuda<float>;
|
return convert_unary_cuda<float>;
|
||||||
default:
|
default:
|
||||||
|
@ -6753,6 +6854,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
return dequantize_row_iq2_xs_cuda;
|
return dequantize_row_iq2_xs_cuda;
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
return dequantize_row_iq3_xxs_cuda;
|
return dequantize_row_iq3_xxs_cuda;
|
||||||
|
case GGML_TYPE_IQ1_S:
|
||||||
|
return dequantize_row_iq1_s_cuda;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
return convert_unary_cuda<half>;
|
return convert_unary_cuda<half>;
|
||||||
default:
|
default:
|
||||||
|
@ -7590,89 +7693,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
||||||
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
|
||||||
int nth = WARP_SIZE;
|
|
||||||
while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
|
||||||
const dim3 block_dims(nth, 1, 1);
|
|
||||||
const dim3 block_nums(nrows_x, 1, 1);
|
|
||||||
const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
|
|
||||||
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
|
||||||
if (shmem <= g_device_caps[g_main_device].smpb) {
|
|
||||||
switch (ncols_x) {
|
|
||||||
case 32:
|
|
||||||
soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 64:
|
|
||||||
soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 256:
|
|
||||||
soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 512:
|
|
||||||
soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 1024:
|
|
||||||
soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 2048:
|
|
||||||
soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
case 4096:
|
|
||||||
soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
const size_t shmem_low = WARP_SIZE*sizeof(half);
|
|
||||||
soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
|
||||||
int nth = WARP_SIZE;
|
int nth = WARP_SIZE;
|
||||||
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||||
const dim3 block_dims(nth, 1, 1);
|
const dim3 block_dims(nth, 1, 1);
|
||||||
const dim3 block_nums(nrows_x, 1, 1);
|
const dim3 block_nums(nrows_x, 1, 1);
|
||||||
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
||||||
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||||
|
|
||||||
|
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||||
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||||
|
|
||||||
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||||
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||||
|
|
||||||
if (shmem < g_device_caps[g_main_device].smpb) {
|
if (shmem < g_device_caps[g_main_device].smpb) {
|
||||||
switch (ncols_x) {
|
switch (ncols_x) {
|
||||||
case 32:
|
case 32:
|
||||||
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 64:
|
case 64:
|
||||||
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 512:
|
case 512:
|
||||||
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 1024:
|
case 1024:
|
||||||
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 2048:
|
case 2048:
|
||||||
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
case 4096:
|
case 4096:
|
||||||
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const size_t shmem_low = WARP_SIZE*sizeof(float);
|
const size_t shmem_low = WARP_SIZE*sizeof(float);
|
||||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
|
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
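[Note] One detail in the launcher above: when the padded row no longer fits in shared memory, the kernel is launched as soft_max_f32<false, 0, 0> and caches intermediate values in dst itself (the vals_smem = false path of the kernel). Worked numbers under an assumed 48 KiB per-block shared-memory budget:

    // ncols_x = 4096:  shmem = (4096 + 32) * 4  = 16512 bytes -> fits, use smem
    // ncols_x = 16384: shmem = (16384 + 32) * 4 = 65664 bytes -> too large,
    //                  fall back to <false, 0, 0> and stage values through dst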
@ -8523,6 +8590,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@ -8546,6 +8614,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@ -8643,6 +8712,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -9082,30 +9155,36 @@ static void ggml_cuda_op_soft_max(

     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

     const int64_t ne00    = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+    const int64_t nrows_y = src0->ne[1];

     float scale    = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    float max_bias = 0.0f;

-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
-#ifdef GGML_CUDA_F16
-    const bool use_f16_soft_max = true;
-#else
-    const bool use_f16_soft_max = false;
-#endif // GGML_CUDA_F16
-#else
-    const bool use_f16_soft_max = false;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

-    if (use_f16_soft_max) {
-        soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-    } else {
-        soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+    // positions tensor
+    float * src2_dd = nullptr;
+    cuda_pool_alloc<float> src2_f;
+
+    ggml_tensor * src2 = dst->src[2];
+    const bool use_src2 = src2 != nullptr;
+
+    if (use_src2) {
+        const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
+
+        if (src2_on_device) {
+            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+            src2_dd = (float *) src2_extra->data_device[g_main_device];
+        } else {
+            src2_dd = src2_f.alloc(ggml_nelements(src2));
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+        }
     }

-    (void) dst;
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
 }

 static void ggml_cuda_op_scale(
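`scale` and `max_bias` travel to the backend packed as two consecutive floats inside the op's `op_params` blob, which is why they are read back with `memcpy` rather than a cast. A self-contained illustration of that packing — a sketch with a stand-in buffer, not ggml API:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    // stand-in for ggml's dst->op_params scratch area
    int32_t op_params[8] = {0};

    const float scale    = 0.125f;
    const float max_bias = 8.0f;

    // pack: two floats stored in the first two 32-bit slots
    std::memcpy((float *) op_params + 0, &scale,    sizeof(float));
    std::memcpy((float *) op_params + 1, &max_bias, sizeof(float));

    // unpack on the backend side, exactly as in the hunk above
    float scale_out = 1.0f, max_bias_out = 0.0f;
    std::memcpy(&scale_out,    (float *) op_params + 0, sizeof(float));
    std::memcpy(&max_bias_out, (float *) op_params + 1, sizeof(float));

    assert(scale_out == scale && max_bias_out == max_bias);
}
```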
@@ -9240,9 +9319,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
                 CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
                 if (can_access_peer) {
                     if (enable_peer_access) {
-                        CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                        cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
+                        if (err != cudaErrorPeerAccessAlreadyEnabled) {
+                            CUDA_CHECK(err);
+                        }
                     } else {
-                        CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                        cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
+                        if (err != cudaErrorPeerAccessNotEnabled) {
+                            CUDA_CHECK(err);
+                        }
                     }
                 }
             }
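`cudaDeviceEnablePeerAccess` returns `cudaErrorPeerAccessAlreadyEnabled` when the mapping already exists, and the disable call similarly returns `cudaErrorPeerAccessNotEnabled`, so the change above treats those two codes as benign instead of asserting. A minimal sketch of the same tolerant pattern, usable anywhere peer access may be toggled repeatedly (helper names are hypothetical):

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Abort on any CUDA error except one explicitly tolerated code.
static void check_cuda(cudaError_t err, cudaError_t tolerated) {
    if (err != cudaSuccess && err != tolerated) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        std::exit(1);
    }
}

void set_peer_access(int dev, int peer, bool enable) {
    cudaSetDevice(dev);
    if (enable) {
        // enabling twice is harmless -> tolerate "already enabled"
        check_cuda(cudaDeviceEnablePeerAccess(peer, 0), cudaErrorPeerAccessAlreadyEnabled);
    } else {
        // disabling twice is harmless -> tolerate "not enabled"
        check_cuda(cudaDeviceDisablePeerAccess(peer), cudaErrorPeerAccessNotEnabled);
    }
}
```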
@@ -10920,10 +11005,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
     UNUSED(buffer);
 }

-// unused at the moment
-//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-//    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-//}
+static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+}

 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;

@@ -11311,7 +11396,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     if (node->src[j] != nullptr) {
                         assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
-                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                         assert(node->src[j]->extra != nullptr);
                     }
                 }

@@ -11359,7 +11444,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 return false;
             }
             ggml_type a_type = a->type;
-            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
+            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
                 if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                     return false;
                 }

ggml-metal.m (75 changed lines)

@@ -61,6 +61,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
     GGML_METAL_KERNEL_TYPE_RMS_NORM,
     GGML_METAL_KERNEL_TYPE_GROUP_NORM,

@@ -83,6 +84,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
     //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,

@@ -101,6 +103,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,

@@ -116,6 +119,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,

@@ -131,6 +135,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F16,
     GGML_METAL_KERNEL_TYPE_ALIBI_F32,

@@ -176,7 +181,7 @@ struct ggml_metal_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-//static NSString * const msl_library_source = @"see metal.metal";
+// static NSString * const msl_library_source = @"see metal.metal";

 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject

@@ -272,6 +277,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             return NULL;
         }
     } else {
+#if GGML_METAL_EMBED_LIBRARY
+        GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+        extern const char ggml_metallib_start[];
+        extern const char ggml_metallib_end[];
+
+        NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
         GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

         NSString * sourcePath;

@@ -294,6 +307,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
+#endif

         @autoreleasepool {
             // dictionary of preprocessor macros

@@ -433,6 +447,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);

@@ -455,6 +470,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
         //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);

@@ -473,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);

@@ -488,6 +505,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);

@@ -503,6 +521,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);

@@ -728,6 +747,7 @@ static bool ggml_metal_graph_compute(

         size_t offs_src0 = 0;
         size_t offs_src1 = 0;
+        size_t offs_src2 = 0;
         size_t offs_dst  = 0;

         id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];

@@ -746,6 +766,7 @@ static bool ggml_metal_graph_compute(

             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
             struct ggml_tensor * dst  = gf->nodes[i];

             switch (dst->op) {

@@ -807,6 +828,7 @@ static bool ggml_metal_graph_compute(

             id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
             id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+            id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
             id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;

             //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
@@ -1188,7 +1210,16 @@ static bool ggml_metal_graph_compute(
                         pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                     }

                     const float scale    = ((float *) dst->op_params)[0];
+                    const float max_bias = ((float *) dst->op_params)[1];
+
+                    const int64_t nrows_x = ggml_nrows(src0);
+                    const int64_t nrows_y = src0->ne[1];
+                    const uint32_t n_head_kv   = nrows_x/nrows_y;
+                    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+                    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

                     [encoder setComputePipelineState:pipeline];
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
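For reference, `m0` and `m1` computed above are the two ALiBi slope bases, and the kernel (further down in ggml-metal.metal) raises them to head-dependent powers. Writing n for `n_head_log2`, the largest power of two not exceeding the head count:

```latex
% m0 = 2^{-max\_bias / n},  m1 = 2^{-max\_bias / (2n)}
\mathrm{slope}(h) =
  \begin{cases}
    m_0^{\,h+1}      & h <    n \\
    m_1^{\,2(h-n)+1} & h \geq n
  \end{cases}
```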
@@ -1197,11 +1228,20 @@ static bool ggml_metal_graph_compute(
                     } else {
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                     }
-                    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
-                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                    [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                    if (id_src2) {
+                        [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                    } else {
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
+                    }
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
+                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4];
+                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5];
+                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
+                    [encoder setBytes:&scale length:sizeof(scale) atIndex:7];
+                    [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
+                    [encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
+                    [encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
+                    [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
                     [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

                     [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];

@@ -1297,6 +1337,7 @@ static bool ggml_metal_graph_compute(
                         case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
                         case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
                         case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
                         default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                     }

@@ -1431,6 +1472,12 @@ static bool ggml_metal_graph_compute(
                             nth1 = 16;
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
                         } break;
+                        case GGML_TYPE_IQ1_S:
+                        {
+                            nth0 = 4;
+                            nth1 = 16;
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                        } break;
                         default:
                         {
                             GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);

@@ -1465,7 +1512,7 @@ static bool ggml_metal_graph_compute(

                     if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                         src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                        src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                        src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
                         [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                     }
                     else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {

@@ -1514,8 +1561,6 @@ static bool ggml_metal_graph_compute(
                     // max size of the src1ids array in the kernel stack
                     GGML_ASSERT(ne11 <= 512);

-                    struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
                     const int64_t ne20 = src2 ? src2->ne[0] : 0;
                     const int64_t ne21 = src2 ? src2->ne[1] : 0;
                     const int64_t ne22 = src2 ? src2->ne[2] : 0;

@@ -1573,6 +1618,7 @@ static bool ggml_metal_graph_compute(
                         case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
                         case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
                         case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
                         default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                     }

@@ -1710,6 +1756,12 @@ static bool ggml_metal_graph_compute(
                             nth1 = 16;
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
                         } break;
+                        case GGML_TYPE_IQ1_S:
+                        {
+                            nth0 = 4;
+                            nth1 = 16;
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
+                        } break;
                         default:
                         {
                             GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);

@@ -1760,7 +1812,7 @@ static bool ggml_metal_graph_compute(

                     if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
                         src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                        src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                        src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
                         [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                     }
                     else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {

@@ -1814,6 +1866,7 @@ static bool ggml_metal_graph_compute(
                         case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
                         case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
                         case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
                         case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                         default: GGML_ASSERT(false && "not implemented");
                     }

ggml-metal.metal (404 changed lines)
@@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
 kernel void kernel_soft_max(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],

@@ -368,13 +373,26 @@ kernel void kernel_soft_max(

     device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
     device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
+    device const float * ppos  = src2 != src0 ? src2             : nullptr;
     device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

+    float slope = 0.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float lmax = -INFINITY;

     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
     }

     // find the max value in the block

@@ -399,7 +417,7 @@ kernel void kernel_soft_max(
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
         lsum += exp_psrc0;
         pdst[i00] = exp_psrc0;
     }
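The kernel now fuses scaling, the optional additive mask, and the ALiBi position bias into one numerically stable softmax. A plain C++ reference of the same math for a single row — a sketch for checking the kernel, including the final normalization that happens further down in it, not ggml API:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// One row of the fused soft-max computed by kernel_soft_max:
// dst = softmax(src*scale + mask + slope*pos), stabilized by max subtraction.
std::vector<float> soft_max_row(const std::vector<float> & src,
                                const float * mask,   // optional, may be nullptr
                                const float * pos,    // optional, may be nullptr
                                float scale, float slope) {
    const size_t n = src.size();
    std::vector<float> dst(n);

    float max_val = -INFINITY;
    for (size_t i = 0; i < n; ++i) {
        const float v = src[i]*scale + (mask ? mask[i] : 0.0f) + (pos ? slope*pos[i] : 0.0f);
        max_val = std::max(max_val, v);
    }

    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        const float v = src[i]*scale + (mask ? mask[i] : 0.0f) + (pos ? slope*pos[i] : 0.0f);
        dst[i] = std::exp(v - max_val);
        sum += dst[i];
    }
    for (float & x : dst) {
        x /= sum;
    }
    return dst;
}
```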
@@ -437,12 +455,17 @@ kernel void kernel_soft_max(
 kernel void kernel_soft_max_4(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],

@@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(

     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+    device const float4 * ppos  = src2 != src0 ? (device const float4 *)(src2)            : nullptr;
     device       float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

+    float slope = 0.0f;
+
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float4 lmax4 = -INFINITY;

     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
     }

     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));

@@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }

@@ -2490,6 +2525,13 @@ typedef struct {
 } block_iq3_xxs;
 // 98 bytes / block for QK_K = 256, so 3.0625 bpw

+typedef struct {
+    half d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+
 //====================================== dot products =========================

 void kernel_mul_mv_q2_K_f32_impl(
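For QK_K = 256 the new block is 2 + 32 + 16 = 50 bytes covering 256 weights, i.e. 1.5625 bits per weight. A quick host-side sanity check — a sketch with `uint16_t` standing in for the device-side `half` so it compiles as plain C++:

```cpp
#include <cstdint>
#include <cstdio>

constexpr int QK_K = 256;

// Host-side mirror of block_iq1_s; uint16_t stands in for the f16 scale.
struct block_iq1_s {
    uint16_t d;                 // per-block scale (half on device)
    uint8_t  qs[QK_K/8];        // low 8 bits of the grid indices
    uint8_t  scales[QK_K/16];   // two 3-bit scales + two index high bits per byte
};

int main() {
    static_assert(sizeof(block_iq1_s) == 50, "2 + 32 + 16 bytes expected");
    std::printf("%.4f bpw\n", 8.0 * sizeof(block_iq1_s) / QK_K); // prints 1.5625
}
```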
@@ -3747,6 +3789,137 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };

+#define NGRID_IQ1S 512
+constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+};
+
 constexpr constant static uint8_t ksigns_iq2xs[128] = {
     0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
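Each `iq1s_grid` entry packs eight signed weights, one byte per weight, with `0x00`, `0x01` and `0xff` encoding 0, +1 and -1; the kernels simply reinterpret an entry as eight `int8_t` lanes. A host-side illustration of that decoding — plain C++, assuming a little-endian host like the device side:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode one iq1s_grid entry into its eight signed lanes, the way the kernels
// do by reinterpreting the uint64 as int8[8] (0x00 -> 0, 0x01 -> +1, 0xff -> -1).
void unpack_iq1s_entry(uint64_t entry, int8_t out[8]) {
    std::memcpy(out, &entry, 8);
}

int main() {
    int8_t w[8];
    unpack_iq1s_entry(0xffffffffffff0101ull, w); // first table entry
    for (int j = 0; j < 8; ++j) {
        std::printf("%d ", w[j]);                // prints: 1 1 -1 -1 -1 -1 -1 -1
    }
    std::printf("\n");
}
```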
@@ -3854,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
         y4 += 32 * 32;
     }
 #else
-    // TODO
+    (void) x;
+    (void) y;
+    (void) yl;
+    (void) nb32;
 #endif

     for (int row = 0; row < N_DST; ++row) {

@@ -3997,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
         y4 += 32 * 32;
     }
 #else
-    // TODO
+    (void) x;
+    (void) y;
+    (void) yl;
+    (void) nb32;
 #endif

     for (int row = 0; row < N_DST; ++row) {

@@ -4133,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
         y4 += 32 * 32;
     }
 #else
-    // TODO
+    (void) x;
+    (void) y;
+    (void) yl;
+    (void) nb32;
 #endif

     for (int row = 0; row < N_DST; ++row) {

@@ -4173,6 +4355,126 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
     kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }

+void kernel_mul_mv_iq1_s_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
+    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+#if QK_K == 256
+    const int ix = tiisg/2;
+    const int il = tiisg%2;
+
+    device const float * y4 = y + 32 * ix + 16 * il;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
+
+        for (int i = 0; i < 16; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq1_s * xr = x + ibl;
+        device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
+        device const uint8_t * sc = xr->scales + 2 * ib + il;
+        device const half    * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
+            constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
+
+            float2 sum = {0};
+            for (int j = 0; j < 8; ++j) {
+                sum[0] += yl[j+ 0] * grid1[j];
+                sum[1] += yl[j+ 8] * grid2[j];
+            }
+            sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
+
+            dh += nb*sizeof(block_iq1_s)/2;
+            qs += nb*sizeof(block_iq1_s);
+            sc += nb*sizeof(block_iq1_s);
+        }
+
+        y4 += 16 * 32;
+    }
+#else
+    (void) x;
+    (void) y;
+    (void) yl;
+    (void) nb32;
+#endif
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq1_s_f32")]]
+kernel void kernel_mul_mv_iq1_s_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
+}

 //============================= templates and their specializations =============================
@@ -4369,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     const float dl = d * sc[0];
     const float ml = min * sc[1];
 #else
+    (void) get_scale_min_k4_just2;
+
     q = q + 16 * (il&1);
     device const uint8_t * s = xb->scales;
     device const half2 * dh = (device const half2 *)xb->d;

@@ -4518,6 +4822,22 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
         }
     }
 }

+template <typename type4x4>
+void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    device const uint8_t * qs = xb->qs + 2*il;
+    device const uint8_t * sc = xb->scales + il;
+    const float dl1 = d * (2*(sc[0] & 7) + 1);
+    const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1);
+    constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
+    constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4+0][i%4] = dl1 * grid1[i];
+        reg[i/4+2][i%4] = dl2 * grid2[i];
+    }
+}
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||||
kernel void kernel_get_rows(
|
kernel void kernel_get_rows(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
|
@@ -5060,6 +5380,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;

 //
 // matrix-matrix multiplication
@@ -5099,6 +5420,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
 template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;

 //
 // indirect matrix-matrix multiplication
@@ -5150,6 +5472,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
 template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;

 //
 // matrix-vector multiplication
@@ -6117,3 +6440,66 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
         tiisg,
         sgitg);
 }
+
+[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
+kernel void kernel_mul_mv_id_iq1_s_f32(
+        device const char * ids,
+        device const char * src1,
+        device float * dst,
+        constant uint64_t & nbi1,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant uint64_t & nb1,
+        constant uint & r2,
+        constant uint & r3,
+        constant int & idx,
+        device const char * src00,
+        device const char * src01,
+        device const char * src02,
+        device const char * src03,
+        device const char * src04,
+        device const char * src05,
+        device const char * src06,
+        device const char * src07,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
+
+    const int64_t bid = tgpig.z/(ne12*ne13);
+
+    tgpig.z = tgpig.z%(ne12*ne13);
+
+    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
+
+    kernel_mul_mv_iq1_s_f32_impl(
+        src0[id],
+        (device const float *) (src1 + bid*nb11),
+        dst + bid*ne0,
+        ne00,
+        ne01,
+        ne02,
+        ne10,
+        ne12,
+        ne0,
+        ne1,
+        r2,
+        r3,
+        tgpig,
+        tiisg,
+        sgitg);
+}
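The "_id" kernel above is the indirect, mixture-of-experts entry point: eight candidate expert matrices arrive as src00..src07, the int32 row at ids + bid*nbi1 names which expert each token uses, and slot idx selects the matrix this launch multiplies. A minimal host-side C sketch of that selection, with a hypothetical name (select_expert is not part of the patch):

    #include <stdint.h>

    // Mirrors "src0[id]" in kernel_mul_mv_id_iq1_s_f32: pick the expert matrix
    // whose index is stored in the ids row for batch 'bid', slot 'idx'.
    static const void * select_expert(const void * experts[8],
                                      const char * ids, int64_t bid,
                                      uint64_t nbi1, int idx) {
        const int32_t id = ((const int32_t *)(ids + bid*nbi1))[idx];
        return experts[id];
    }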
ggml-quants.c (679 changed lines)
@@ -1839,9 +1839,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     float sigma2 = sumx2/QK_K;
     for (int j = 0; j < QK_K/16; ++j) {
         const float * restrict qw = quant_weights + QK_K * i + 16*j;
-        for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+        for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
-        for (int l = 0; l < 16; ++l) sw[j] += weight[l];
+        for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
-        scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
     }

     float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
@@ -3482,6 +3482,139 @@ static const uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };

+#define NGRID_IQ2XXS 512
+static const uint64_t iq1s_grid[NGRID_IQ2XXS] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+};
+
 static const uint8_t ksigns_iq2xs[128] = {
     0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
     144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@@ -3580,6 +3713,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
     }
 }

+// ====================== 1.5625 bpw (de)-quantization
+
+void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    float db[4];
+    uint16_t idx[4];
+    //const int8_t * grid[4];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * sc = x[i].scales;
+        const uint8_t * qs = x[i].qs;
+
+        for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
+            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
+            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
+            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
+            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
+            //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
+            //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
+            //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
+            //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
+            db[0] = d * (2*(sc[0] & 7) + 1);
+            db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
+            db[2] = d * (2*(sc[1] & 7) + 1);
+            db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    //y[j] = db[l] * grid[l][j];
+                    y[j] = db[l] * grid[j];
+                }
+                y += 8;
+            }
+            qs += 4;
+            sc += 2;
+        }
+    }
+}
+
 //===================================== Q8_K ==============================================

 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
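A note on the packing that dequantize_row_iq1_s above decodes: each group of 8 weights is addressed by a 9-bit index into iq1s_grid, whose low 8 bits come from one qs byte and whose 9th bit is stashed in the spare bit of a 4-bit scale field. A standalone C sketch of the unpacking for one scale byte (two groups), assuming QK_K == 256:

    #include <stdint.h>

    // One scale byte covers two 8-weight groups:
    //   bits 0..2 scale of group 0, bit 3 the 9th index bit of group 0,
    //   bits 4..6 scale of group 1, bit 7 the 9th index bit of group 1.
    static void unpack_iq1_s_pair(uint8_t qs0, uint8_t qs1, uint8_t sc,
                                  uint16_t idx[2], int scale[2]) {
        idx[0]   = qs0 | ((sc & 0x08) << 5);  // 0x08 << 5 lands on bit 8
        idx[1]   = qs1 | ((sc & 0x80) << 1);  // 0x80 << 1 lands on bit 8
        scale[0] = 2*(sc & 7) + 1;            // odd scales 1, 3, ..., 15
        scale[1] = 2*((sc >> 4) & 7) + 1;
    }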
@@ -3850,15 +4026,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

         const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);

-        __m128i bx = _mm_and_si128(lowMask, tmp);
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp);
-        __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx = _mm_sub_epi8(bx, off);
+        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

-        bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
+        bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx = _mm_sub_epi8(bx, off);
+        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);

         // Convert int32_t to float
         __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
@@ -4444,21 +4620,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_andnot_si128(bxhil, mask);
         bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);

-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);

         /* Multiply q with scale and accumulate */
         acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@@ -4751,22 +4927,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_and_si128(bxhil, mask);
         bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);

         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);

         acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
     }
@@ -4995,10 +5171,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r

     for (int i = 0; i < nb; i++) {
         // load elements
-        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
-        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);

-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);

         vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
         vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
@@ -9109,6 +9285,178 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }

+#ifdef __AVX2__
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+#endif
+
+void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_s * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+
+    const uint8x16_t m8 = vdupq_n_u8(0x08);
+    const uint8x16_t m7 = vdupq_n_u8(0x07);
+    const uint8x16_t m1 = vdupq_n_u8(0x01);
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    uint16_t gindex[8];
+    uint16x8x2_t vindex;
+    int8x16x4_t q1b;
+    int8x16x4_t q8b;
+    uint16x8x4_t scales;
+    int32x4x2_t sumi;
+    int32x4x2_t dotq;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        sumi.val[0] = sumi.val[1] = vzero;
+
+        for (int i128 = 0; i128 < QK_K/128; ++i128) {
+            const uint8x16_t ql = vld1q_u8(qs); qs += 16;
+            const uint8x8_t tm1 = vld1_u8 (sc); sc += 8;
+            const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
+            const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
+            const uint8x16_t hbit = vandq_u8(qh, m8);
+            vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
+            vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
+            const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
+            scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
+            scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
+
+            for (int l = 0; l < 2; ++l) {
+                vst1q_u16(gindex+0, vindex.val[l]);
+                q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
+                q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
+                q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
+                q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
+                q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+                dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
+                dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
+
+                sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
+                sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
+            }
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m128i m8 = _mm_set1_epi8(0x08);
+    const __m128i m7 = _mm_set1_epi8(0x07);
+    const __m128i m1 = _mm_set1_epi8(0x01);
+    const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
+    const __m128i shuffle_s[4] = {
+        _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
+        _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
+        _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
+        _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
+    };
+
+    uint64_t aux64;
+
+    __m256i v_gindex;
+    const uint16_t * gindex = (const uint16_t *)&v_gindex;
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        __m256i sumi = _mm256_setzero_si256();
+        for (int i128 = 0; i128 < QK_K/128; ++i128) {
+            const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
+            memcpy(&aux64, sc, 8); sc += 8;
+            const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
+            const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
+            v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
+            const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
+
+            for (int i32 = 0; i32 < 4; ++i32) {
+                const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+                const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]],
+                                                      iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]);
+                const __m256i dot = mul_add_epi8(q1b, q8b);
+                const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
+                const __m256i p = _mm256_madd_epi16(s16, dot);
+                sumi = _mm256_add_epi32(sumi, p);
+            }
+
+        }
+
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
+
+    }
+
+    *s = hsum_float_8(accum);
+
+#else
+
+    int db[4];
+    uint16_t idx[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int sumi = 0;
+        for (int i32 = 0; i32 < QK_K/32; ++i32) {
+            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
+            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
+            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
+            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
+            db[0] = (2*(sc[0] & 7) + 1);
+            db[1] = (2*((sc[0] >> 4) & 7) + 1);
+            db[2] = (2*(sc[1] & 7) + 1);
+            db[3] = (2*((sc[1] >> 4) & 7) + 1);
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                int suml = 0;
+                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
+                sumi += db[l] * suml;
+                q8 += 8;
+            }
+            qs += 4;
+            sc += 2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
+    }
+
+    *s = sumf;
+
+#endif
+
+}
+
 // ================================ IQ2 quantization =============================================

 typedef struct {
@@ -9117,14 +9465,22 @@ typedef struct {
     uint16_t * neighbours;
 } iq2_entry_t;

-static iq2_entry_t iq2_data[2] = {
+static iq2_entry_t iq2_data[3] = {
+    {NULL, NULL, NULL},
     {NULL, NULL, NULL},
     {NULL, NULL, NULL},
 };

-static inline int iq2_data_index(int grid_size) {
+static inline int iq2_data_index(enum ggml_type type) {
-    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
-    return grid_size == 256 ? 0 : 1;
+    return type == GGML_TYPE_IQ2_XXS ? 0 :
+           type == GGML_TYPE_IQ2_XS ? 1 : 2;
+}
+
+static inline int iq2_grid_size(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
+    return type == GGML_TYPE_IQ2_XXS ? 256 :
+           type == GGML_TYPE_IQ2_XS ? 512 : 512;
 }

 static int iq2_compare_func(const void * left, const void * right) {
@@ -9133,12 +9489,13 @@ static int iq2_compare_func(const void * left, const void * right) {
     return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
 }

-void iq2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(enum ggml_type type) {
-    const int gindex = iq2_data_index(grid_size);
+    const int gindex = iq2_data_index(type);
+    const int grid_size = iq2_grid_size(type);
     if (iq2_data[gindex].grid) {
         return;
     }
-    static const uint16_t kgrid_256[256] = {
+    static const uint16_t kgrid_2bit_256[256] = {
         0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
         100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
         1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,

@@ -9156,7 +9513,7 @@ void iq2xs_init_impl(int grid_size) {
         33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
         37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
     };
-    static const uint16_t kgrid_512[512] = {
+    static const uint16_t kgrid_2bit_512[512] = {
         0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
         73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
         260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,

@@ -9190,9 +9547,45 @@ void iq2xs_init_impl(int grid_size) {
         40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
         42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
     };
+    static const uint16_t kgrid_1bit_512[512] = {
+        10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545,
+        553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444,
+        1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440,
+        2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422,
+        4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397,
+        5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769,
+        5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788,
+        6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794,
+        9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272,
+        10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
+        16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
+        17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
+        18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
+        20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
+        20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
+        21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
+        21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
+        21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
+        22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
+        23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
+        25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
+        25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
+        26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
+        32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
+        33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
+        34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
+        37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
+        38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
+        38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
+        39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
+        41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
+        42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
+    };
+
     const int kmap_size = 43692;
-    const int nwant = 2;
+    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-    const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
+    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
+                             type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512;
     uint64_t * kgrid_q2xs;
     int * kmap_q2xs;
     uint16_t * kneighbors_q2xs;
@@ -9288,9 +9681,9 @@ void iq2xs_init_impl(int grid_size) {
     free(dist2);
 }

-void iq2xs_free_impl(int grid_size) {
+void iq2xs_free_impl(enum ggml_type type) {
-    GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
-    const int gindex = iq2_data_index(grid_size);
+    const int gindex = iq2_data_index(type);
     if (iq2_data[gindex].grid) {
         free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
         free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
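Since the grid tables are now keyed by tensor type, IQ1_S quantization follows the same init/use/free cycle as the IQ2 types. A sketch of the expected call order (quantize_tensor_iq1_s is a hypothetical wrapper, not code from this patch):

    // quantize_row_iq1_s_impl() asserts that the tables exist ("forgot to
    // call ggml_quantize_init()?"), so iq2xs_init_impl() must run first.
    static size_t quantize_tensor_iq1_s(const float * src, void * dst,
                                        int nrow, int n_per_row,
                                        const float * imatrix) {
        iq2xs_init_impl(GGML_TYPE_IQ1_S);   // build grid, map, neighbours
        size_t written = quantize_iq1_s(src, dst, nrow, n_per_row,
                                        NULL /*hist is unused*/, imatrix);
        iq2xs_free_impl(GGML_TYPE_IQ1_S);   // release the tables
        return written;
    }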
@@ -9324,7 +9717,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u

 static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {

-    const int gindex = iq2_data_index(256);
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);

     const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
     const int * kmap_q2xs = iq2_data[gindex].map;
@@ -9497,7 +9890,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict

 static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {

-    const int gindex = iq2_data_index(512);
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);

     const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
     const int * kmap_q2xs = iq2_data[gindex].map;
@@ -10134,3 +10527,207 @@ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * re
     assert(k % QK_K == 0);
     quantize_row_iq3_xxs_impl(x, y, k, NULL);
 }

+// =================================== 1.5 bpw ===================================================
+
+static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_score = 0;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float sumqx = 0, sumq2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = (pg[i] - 3)/2;
+            float w = weight[i];
+            sumqx += w*q*xval[i];
+            sumq2 += w*q*q;
+        }
+        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+            *scale = sumqx/sumq2; best_score = *scale * sumqx;
+            grid_index = neighbours[j];
+        }
+    }
+    if (grid_index < 0) {
+        for (int i = 0; i < ngrid; ++i) {
+            const int8_t * grid_i = (const int8_t *)(grid + i);
+            float sumqx = 0, sumq2 = 0;
+            for (int j = 0; j < 8; ++j) {
+                float w = weight[j];
+                float q = (grid_i[j] - 3)/2;
+                sumqx += w*q*xval[j];
+                sumq2 += w*q*q;
+            }
+            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                *scale = sumqx/sumq2; best_score = *scale*sumqx;
+                grid_index = i;
+            }
+        }
+    }
+    if (grid_index < 0) {
+        printf("Oops, did not find grid point\n");
+        printf("Have %d neighbours\n", num_neighbors);
+        for (int j = 1; j <= num_neighbors; ++j) {
+            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+            float sumqx = 0, sumq2 = 0;
+            for (int i = 0; i < 8; ++i) {
+                float q = (pg[i] - 3)/2;
+                float w = weight[i];
+                sumqx += w*q*xval[i];
+                sumq2 += w*q*q;
+            }
+            printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static int iq1_sort_helper(const void * left, const void * right) {
+    const float * l = left;
+    const float * r = right;
+    return *l < *r ? -1 : *l > *r ? 1 : 0;
+}
+
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
+
+    const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+    const int * kmap_q2xs = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int nbl = n/256;
+
+    block_iq1_s * y = vy;
+
+    float scales[QK_K/8];
+    float weight[8];
+    int8_t L[8];
+    float sumx[9];
+    float sumw[9];
+    float pairs[16];
+    int * idx = (int *)(pairs + 1);
+    uint8_t hbit[QK_K/8];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].scales, 0, QK_K/16);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/8; ++ib) {
+            const float * xb = xbl + 8*ib;
+            const float * qw = quant_weights + QK_K*ibl + 8*ib;
+            for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 1, 8);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+            // and score for each possible split.
+            for (int j = 0; j < 8; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
+            {
+                sumx[0] = sumw[0] = 0;
+                for (int j = 0; j < 8; ++j) {
+                    int i = idx[2*j];
+                    sumx[j+1] = sumx[j] + weight[i]*xb[i];
+                    sumw[j+1] = sumw[j] + weight[i];
+                }
+            }
+            float best_score = 0, scale = max;
+            int besti1 = 0, besti2 = 0;
+            for (int i1 = 0; i1 <= 8; ++i1) {
+                for (int i2 = i1; i2 <= 8; ++i2) {
+                    float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
+                    float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
+                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                        scale = sumqx/sumq2; best_score = scale*sumqx;
+                        besti1 = i1; besti2 = i2;
+                    }
+                }
+            }
+            for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
+                scale = -scale;
+            }
+            // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
+            // grid point that minimizes SSD.
+            uint16_t u = 0;
+            for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
+            int grid_index = kmap_q2xs[u];
+            if (grid_index < 0) {
+                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
+                GGML_ASSERT(grid_index >= 0);
+            }
+            y[ibl].qs[ib] = grid_index & 255;
+            hbit[ib] = grid_index >> 8;
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/8);
+            continue;
+        }
+
+        float d = max_scale/15;
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f);  // 1.085f is another fudge factor. Don't ask me why it is needed.
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/8; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(7, l));
+            if (hbit[ib]) l |= 8;
+            y[ibl].scales[ib/2] |= (l << 4*(ib%2));
+        }
+    }
+}
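The split search inside the function above can be read in isolation: after sorting the 8 values, every cut-point pair (i1, i2) assigns -1 below i1, 0 up to i2 and +1 from i2, and the prefix sums score each assignment in O(1). A self-contained C sketch of just that search, simplified to unit weights (the real code weighs each entry and flips all signs when the best scale comes out negative):

    #include <stdlib.h>

    static int cmp_float(const void * a, const void * b) {
        const float l = *(const float *)a, r = *(const float *)b;
        return l < r ? -1 : l > r ? 1 : 0;
    }

    // Best scale s so that s*q, q in {-1,0,+1}, approximates x (unit weights).
    static float best_ternary_scale(const float x_in[8]) {
        float x[8]; for (int j = 0; j < 8; ++j) x[j] = x_in[j];
        qsort(x, 8, sizeof(float), cmp_float);
        float sumx[9] = {0};
        for (int j = 0; j < 8; ++j) sumx[j+1] = sumx[j] + x[j];  // prefix sums
        float best_score = 0, scale = 0;
        for (int i1 = 0; i1 <= 8; ++i1) {        // x[0..i1) -> -1
            for (int i2 = i1; i2 <= 8; ++i2) {   // x[i1..i2) -> 0, rest -> +1
                const float sumqx = -sumx[i1] + (sumx[8] - sumx[i2]); // <q,x>
                const float sumq2 = (float)(i1 + (8 - i2));           // <q,q>
                if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                    scale = sumqx/sumq2; best_score = scale*sumqx;
                }
            }
        }
        return scale;
    }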
+
+size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_s);
+    }
+    return nrow * nblock * sizeof(block_iq1_s);
+}
@@ -191,6 +191,13 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -243,6 +250,7 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
@ -259,6 +267,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
@ -266,6 +275,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
@ -276,8 +286,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
|
||||||
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
|
||||||
void iq2xs_init_impl(int grid_size);
|
void iq2xs_init_impl(enum ggml_type type);
|
||||||
void iq2xs_free_impl(int grid_size);
|
void iq2xs_free_impl(enum ggml_type type);
|
||||||
void iq3xs_init_impl(int grid_size);
|
void iq3xs_init_impl(int grid_size);
|
||||||
void iq3xs_free_impl(int grid_size);
|
void iq3xs_free_impl(int grid_size);
|
||||||
|
|
||||||
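As a quick sanity check on the new block layout (assuming the default QK_K = 256), the static_assert above pins the block at 2 + 256/8 + 256/16 = 50 bytes per 256 weights, i.e. 50*8/256 = 1.5625 bits per weight:

#include <assert.h>
#define QK_K 256
// 2-byte fp16 super-block scale + 32 bytes of grid indices + 16 bytes of packed 4-bit scales
static_assert(2 + QK_K/8 + QK_K/16 == 50, "iq1_s: 50 bytes per 256 weights");
// 50 bytes * 8 bits / 256 weights = 1.5625 bits per weight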
ggml-sycl.cpp (258 changes)
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
     }
 }
 
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
+template <int qk, int qi, typename block_q_t, int vdr,
+          vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
+    ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
+            vx, vy, dst, ncols, nrows, item_ct1);
+    });
+}
 
-static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
 
 int get_device_index_by_id(int id){
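The hunk below shows the payoff of the refactor above: the ten near-identical launchers collapse into one templated submitter, and each call site just supplies the per-type constants, e.g. for Q4_0 (taken from the diff that follows):

mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
                             VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
    src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);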
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    // TODO: support these quantization types
+    GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
+                  src0->type == GGML_TYPE_IQ2_XS  ||
+                  src0->type == GGML_TYPE_IQ3_XXS ||
+                  src0->type == GGML_TYPE_IQ1_S));
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
+                                         VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
+                                         VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
            break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
+                                         VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
+                                         VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
+                                         VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
+                                         VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
+                                         VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
+                                         VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
+                                         VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
+                                         VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
             GGML_ASSERT(false);
            break;
|
||||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||||
const dpct::queue_ptr &stream) {
|
const dpct::queue_ptr &stream) {
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||||
|
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
|
@@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
             return false;
         }
 
+        if (a->type == GGML_TYPE_IQ1_S) {
+            return false;
+        }
+        if (a->type == GGML_TYPE_IQ3_XXS) {
+            return false;
+        }
         if (a->type == GGML_TYPE_IQ2_XXS) {
             return false;
         }
ggml-vulkan.cpp (103 changes)
@@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     }
 }
 
-static void ggml_vk_instance_init() {
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
+void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
@@ -1100,28 +1103,42 @@ static void ggml_vk_instance_init() {
 #endif
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_LAYER_KHRONOS_validation",
-#endif
-    };
-    const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-        "VK_EXT_validation_features",
-#endif
-    };
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-    vk::ValidationFeaturesEXT validation_features = {
-        features_enable,
-        {},
-    };
-    validation_features.setPNext(nullptr);
-    instance_create_info.setPNext(&validation_features);
-
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+
+        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
     memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
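The net effect is that layer and extension selection moves from compile time (#ifdef blocks) to a runtime probe of what the loaded Vulkan driver actually exposes. A minimal sketch of the probe pattern (hypothetical helper, using the same vulkan-hpp entry point as the diff):

#include <vulkan/vulkan.hpp>
#include <cstring>

static bool has_instance_ext(const char * name) {
    for (const auto & props : vk::enumerateInstanceExtensionProperties()) {
        if (strcmp(name, props.extensionName) == 0) {
            return true;   // the driver advertises this extension
        }
    }
    return false;
}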
@@ -1168,12 +1185,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     vk_instance.devices[idx] = std::make_shared<vk_device>();
     ctx->device = vk_instance.devices[idx];
     ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
 
     bool maintenance4_support = false;
 
     // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
             maintenance4_support = true;
         }
@@ -1204,7 +1221,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     bool fp16_storage = false;
     bool fp16_compute = false;
 
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
         if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
             fp16_storage = true;
         } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@@ -5301,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
     return vk_instance.device_indices.size();
 }
 
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
 // checks
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
ggml.c (202 changes)
@@ -23,6 +23,9 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -673,6 +678,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_IQ1_S] = {
+        .type_name                = "iq1_s",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq1_s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name                = "q8_K",
         .blck_size                = QK_K,
@@ -868,7 +885,7 @@ do {
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
     const __m128 t1 = _mm_hadd_ps(t0, t0); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
 } while (0)
 // TODO: is this optimal ?
 
@@ -1149,7 +1166,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
         x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
 }
 // TODO: is this optimal ?
 
@@ -1959,7 +1976,7 @@ struct ggml_numa_nodes {
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
     uint32_t current_node; // node on which main process is executing
-#ifdef __linux__
+#if defined(__gnu_linux__)
     cpu_set_t cpuset; // cpuset from numactl
 #else
     uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
@@ -1997,7 +2014,7 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-#ifdef __linux__
+#if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
     pthread_t thread;
@@ -2019,7 +2036,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
         return;
     }
 
-#ifdef __linux__
+#if defined(__gnu_linux__)
     struct stat st;
     char path[256];
     int rv;
@@ -2051,7 +2068,13 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
 
     // figure out which node we're on
     uint current_cpu;
-    int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+    int getcpu_ret = 0;
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
+    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+#else
+    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
+    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
+#endif
 
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
         g_state.numa.n_nodes = 0;
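The glibc version gate reflects that the getcpu() wrapper only landed in glibc 2.29; older glibc exposes just the raw syscall. A standalone sketch of the same fallback (assuming a Linux/glibc build; my_getcpu is a hypothetical name):

#define _GNU_SOURCE
#include <sched.h>        // getcpu() wrapper, glibc >= 2.29
#include <sys/syscall.h>  // SYS_getcpu
#include <unistd.h>       // syscall()

static int my_getcpu(unsigned int * cpu, unsigned int * node) {
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
    return getcpu(cpu, node);                     // thin wrapper over the syscall
#else
    return syscall(SYS_getcpu, cpu, node, NULL);  // raw syscall on old glibc
#endif
}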
@@ -2086,6 +2109,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
         }
     }
 #else
+    GGML_UNUSED(numa_flag);
     // TODO
 #endif
 }
@@ -2266,6 +2290,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;   break;
         case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3219,7 +3244,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
 }
 
 struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
+    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
     tensor->name[sizeof(tensor->name) - 1] = '\0';
     return tensor;
 }
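The change fixes a classic strncpy pitfall: passing the full buffer size can leave the destination unterminated when the source fills it, and newer GCC flags it with -Wstringop-truncation. The idiom in miniature (buf and name are placeholders):

char buf[64];
strncpy(buf, name, sizeof(buf) - 1);  // copy at most size-1 bytes
buf[sizeof(buf) - 1] = '\0';          // termination guaranteed either way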
@@ -5095,16 +5120,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
+        struct ggml_tensor  * pos,
         float                 scale,
+        float                 max_bias,
         bool                  inplace) {
     GGML_ASSERT(ggml_is_contiguous(a));
 
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[2] == 1);
-        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_is_matrix(mask));
         GGML_ASSERT(ggml_can_repeat_rows(mask, a));
     }
 
+    if (pos) {
+        GGML_ASSERT(ggml_is_vector(pos));
+        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->ne[0] == a->ne[0]);
+    }
+
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(pos);
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -5113,13 +5150,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
+    result->src[2] = pos;
 
     return result;
 }
@@ -5127,21 +5165,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        float                 scale) {
-    return ggml_soft_max_impl(ctx, a, mask, scale, false);
+        struct ggml_tensor  * pos,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
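Put together, the extended op computes a fused softmax(a*scale + mask + slope(h)*pos) per row, with the per-head slope driven by max_bias. A reference sketch of the row-level semantics (plain C, not the ggml kernel):

#include <math.h>

static void soft_max_row(float * x, const float * mask, const float * pos,
                         int n, float scale, float slope) {
    float mx = -INFINITY;
    for (int j = 0; j < n; ++j) {
        x[j] = x[j]*scale + (mask ? mask[j] : 0.0f) + (pos ? slope*pos[j] : 0.0f);
        if (x[j] > mx) mx = x[j];
    }
    float sum = 0.0f;
    for (int j = 0; j < n; ++j) { x[j] = expf(x[j] - mx); sum += x[j]; }
    for (int j = 0; j < n; ++j) x[j] /= sum;   // each row sums to 1
}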
@@ -7661,6 +7701,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7928,6 +7969,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -8048,6 +8090,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
         default:
             {
                 GGML_ASSERT(false);
@@ -10814,6 +10857,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             {
                 ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
             } break;
@@ -10994,6 +11038,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
         default:
             {
                 GGML_ASSERT(false);
@@ -11191,6 +11236,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
|
const struct ggml_tensor * src2,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(ggml_is_contiguous(dst));
|
assert(ggml_is_contiguous(dst));
|
||||||
assert(ggml_are_same_shape(src0, dst));
|
assert(ggml_are_same_shape(src0, dst));
|
||||||
|
@ -11502,16 +11549,29 @@ static void ggml_compute_forward_soft_max_f32(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
float scale = 1.0f;
|
float scale = 1.0f;
|
||||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
float max_bias = 0.0f;
|
||||||
|
|
||||||
|
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||||
|
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||||
|
|
||||||
// TODO: handle transposed/permuted matrices
|
// TODO: handle transposed/permuted matrices
|
||||||
|
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
|
||||||
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
||||||
|
|
||||||
|
// TODO: is this supposed to be ceil instead of floor?
|
||||||
|
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
|
||||||
|
const uint32_t n_head_kv = ne02;
|
||||||
|
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
|
||||||
|
|
||||||
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||||
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||||
|
|
||||||
const int nc = src0->ne[0];
|
const int nc = src0->ne[0];
|
||||||
const int nr = ggml_nrows(src0);
|
const int nr = ggml_nrows(src0);
|
||||||
|
|
||||||
|
@ -11524,6 +11584,9 @@ static void ggml_compute_forward_soft_max_f32(
|
||||||
|
|
||||||
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
||||||
|
|
||||||
|
// when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
|
||||||
|
float * pos = src2 ? (float *) src2->data : src0->data;
|
||||||
|
|
||||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||||
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||||
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||||
|
@ -11537,6 +11600,16 @@ static void ggml_compute_forward_soft_max_f32(
|
||||||
ggml_vec_acc_f32(nc, wp, mp);
|
ggml_vec_acc_f32(nc, wp, mp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ALiBi bias
|
||||||
|
if (max_bias > 0.0f) {
|
||||||
|
const uint32_t h = (i1/ne01)%ne02; // head
|
||||||
|
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
|
||||||
|
|
||||||
|
for (int i = 0; i < nc; i++) {
|
||||||
|
wp[i] = wp[i] + slope*pos[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
for (int i = 0; i < nc; ++i) {
|
for (int i = 0; i < nc; ++i) {
|
||||||
//printf("p[%d] = %f\n", i, p[i]);
|
//printf("p[%d] = %f\n", i, p[i]);
|
||||||
|
@ -11581,11 +11654,12 @@ static void ggml_compute_forward_soft_max(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
|
const struct ggml_tensor * src2,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
|
ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
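For reference, the slope schedule in the kernel above is the standard ALiBi recipe: the first n_head_log2 heads get slopes m0^(h+1) and the remainder get m1^(2*(h-n_head_log2)+1). A standalone helper (hypothetical, mirroring the constants used above):

#include <math.h>

static float alibi_slope(int h, int n_head_kv, float max_bias) {
    const int n_head_log2 = 1 << (int) floorf(log2f((float) n_head_kv));
    const float m0 = powf(2.0f, -max_bias          / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1)
                           : powf(m1, 2*(h - n_head_log2) + 1);
}
// e.g. max_bias = 8, n_head_kv = 8 gives slopes 1/2, 1/4, ..., 1/256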
@@ -11729,22 +11803,20 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int64_t i = 0; i < ne0; i++) {
-        for (int64_t j = 0; j < ne1; j++) {
-            for (int64_t k = 0; k < ne2_ne3; k++) {
+    for (int64_t k = 0; k < ne2_ne3; k++) {
+        // TODO: k*nb2 or k*nb3
+        float m_k;
+
+        if (k < n_heads_log2_floor) {
+            m_k = powf(m0, k + 1);
+        } else {
+            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+        }
+
+        for (int64_t i = 0; i < ne0; i++) {
+            for (int64_t j = 0; j < ne1; j++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
                 pdst[0] = i * m_k + src[0];
             }
         }
@@ -11789,21 +11861,20 @@ static void ggml_compute_forward_alibi_f16(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
+    for (int k = 0; k < ne2_ne3; k++) {
+        // TODO: k*nb2 or k*nb3
+        float m_k;
+
+        if (k < n_heads_log2_floor) {
+            m_k = powf(m0, k + 1);
+        } else {
+            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+        }
+
+        for (int i = 0; i < ne0; i++) {
+            for (int j = 0; j < ne1; j++) {
                 ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *            pdst =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
                 // we return F32
                 pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
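The reorder above is plain loop-invariant code motion: m_k depends only on k, so making k the outermost loop computes each head's slope once instead of ne0*ne1 times. Schematically (m0, m1 and the bounds come from the surrounding function):

for (int64_t k = 0; k < ne2_ne3; k++) {
    const float m_k = (k < n_heads_log2_floor)
        ? powf(m0, k + 1)
        : powf(m1, 2*(k - n_heads_log2_floor) + 1);
    for (int64_t i = 0; i < ne0; i++) {
        for (int64_t j = 0; j < ne1; j++) {
            // inner body only reuses m_k; nothing here recomputes it
        }
    }
}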
@@ -11839,6 +11910,7 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -11916,6 +11988,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -15115,7 +15188,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -16672,7 +16745,7 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 // Android's libc implementation "bionic" does not support setting affinity
-#if defined(__linux__) && !defined(__BIONIC__)
+#if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -17847,7 +17920,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 
             ptr += ggml_nbytes(tensor);
 
-            fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+            fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 
@@ -17950,7 +18023,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
 
             result->nodes[i] = tensor;
 
-            fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+            fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
         }
     }
 }
@@ -18575,7 +18648,9 @@ static enum ggml_opt_result linesearch_backtracking(
         (*step) *= width;
     }
 
-    GGML_UNREACHABLE();
+    GGML_ASSERT(false && "line search failed");
+
+    return GGML_LINESEARCH_FAIL;
 }
 
 static enum ggml_opt_result ggml_opt_lbfgs(
@@ -18843,7 +18918,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             step[0] = 1.0;
         }
 
-        GGML_UNREACHABLE();
+        GGML_ASSERT(false && "lbfgs failed");
+
+        return GGML_OPT_DID_NOT_CONVERGE;
     }
 
 struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -19091,8 +19168,9 @@ void ggml_quantize_init(enum ggml_type type) {
     ggml_critical_section_start();
 
     switch (type) {
-        case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
-        case GGML_TYPE_IQ2_XS:  iq2xs_init_impl(512); break;
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ1_S:   iq2xs_init_impl(type); break;
         case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
         default: // nothing
             break;
@@ -19104,8 +19182,10 @@ void ggml_quantize_init(enum ggml_type type) {
 void ggml_quantize_free(void) {
     ggml_critical_section_start();
 
-    iq2xs_free_impl(256);
-    iq2xs_free_impl(512);
+    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
+    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
+    iq2xs_free_impl(GGML_TYPE_IQ1_S);
+    iq3xs_free_impl(256);
 
     ggml_critical_section_end();
 }
@@ -19240,7 +19320,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
 bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
-        type == GGML_TYPE_IQ2_XS;
+        type == GGML_TYPE_IQ2_XS  ||
+        type == GGML_TYPE_IQ1_S;
 }
 
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
@@ -19365,6 +19446,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                 GGML_ASSERT(result == row_size * nrows);
             } break;
+        case GGML_TYPE_IQ1_S:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
+            } break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
ggml.h (15 changes)
@@ -361,6 +361,7 @@ extern "C" {
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
+        GGML_TYPE_IQ1_S   = 19,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -398,6 +399,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
     };
 
     // available tensor operations:
@@ -1390,13 +1392,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
     // mask is optional
+    // pos is required when max_bias > 0.0f
+    // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
+            float                 scale,
+            float                 max_bias);
 
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
@@ -1498,12 +1504,13 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
+        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
 
     // clamp
     // in-place, returns view(a)
|
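Taken together, these header changes fold scaling, masking, and ALiBi into one fused op. A before/after sketch of a caller migrating off the deprecated ggml_alibi (tensor names are placeholders: kq is the attention score matrix, kq_mask the optional mask, kq_pos an F32 tensor of KV-cache positions):

    // before (deprecated path): separate scale, ALiBi, add-mask, and soft-max ops
    // kq = ggml_scale(ctx, kq, kq_scale);
    // kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
    // kq = ggml_add(ctx, kq, kq_mask);
    // kq = ggml_soft_max(ctx, kq);

    // after: one fused op; with max_bias == 0.0f (pos may then be NULL)
    // this degenerates to plain soft_max(a*scale + mask)
    kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, max_alibi_bias);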
llama.cpp (266 changed lines)

@@ -1585,12 +1585,13 @@ struct llama_hparams {
     uint32_t n_yarn_orig_ctx;
     int32_t  rope_scaling_type_train;

-    float f_clamp_kqv;
-    float f_max_alibi_bias;
+    float f_clamp_kqv      = 0.0f;
+    float f_max_alibi_bias = 0.0f;

     bool causal_attn = true;
-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    bool need_kq_pos = false;
+
+    uint32_t pooling_type = LLAMA_POOLING_NONE;

     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;

@@ -1955,6 +1956,7 @@ struct llama_context {
     struct ggml_tensor * inp_embd;    // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+    struct ggml_tensor * inp_KQ_pos;  // F32 [n_ctx]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;     // I32 [n_batch]

@@ -2557,6 +2559,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+                case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));

@@ -2919,6 +2922,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";

         default: return "unknown, may not work";
     }

@@ -3100,6 +3104,11 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_13B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                if (model.type == e_model::MODEL_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
             } break;
         case LLM_ARCH_STARCODER:
             {

@@ -3127,6 +3136,9 @@ static void llm_load_hparams(
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
             } break;
         case LLM_ARCH_BERT:
             {

@@ -3172,11 +3184,12 @@ static void llm_load_hparams(
                     case 4096: model.type = e_model::MODEL_7B; break;
                 } break;
             }
+
+            // TODO: become GGUF KV parameter
+            hparams.f_max_alibi_bias = 8.0f;
         } break;
         case LLM_ARCH_MPT:
             {
-                hparams.f_clamp_kqv = 0.0f;
-
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

@@ -3278,6 +3291,10 @@ static void llm_load_hparams(
     }

     model.ftype = ml.ftype;
+
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.need_kq_pos = true;
+    }
 }

 // TODO: This should probably be in llama.h
@@ -4846,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t   n_ctx,
         int32_t   n_tokens,
         int32_t   n_kv,
-        float     max_alibi_bias,
         float     kq_scale,
         const llm_build_cb & cb,
         int       il) {

@@ -4879,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }

-    if (max_alibi_bias > 0.0f) {
-        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+#pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
+#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
+    if (hparams.f_max_alibi_bias > 0.0f) {
         kq = ggml_scale(ctx, kq, kq_scale);
         cb(kq, "kq_scaled", il);

-        if (max_alibi_bias > 0.0f) {
-            // TODO: n_head or n_head_kv
-            // TODO: K-shift is likely not working
-            // TODO: change to ggml_add
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-        }
+        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+        cb(kq, "kq_scaled_alibi", il);

         kq = ggml_add(ctx, kq, kq_mask);
         cb(kq, "kq_masked", il);

         kq = ggml_soft_max(ctx, kq);
         cb(kq, "kq_soft_max", il);
-    } else {
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+    } else
+#endif
+    {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
     }

@@ -4946,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t   n_ctx,
         int32_t   n_tokens,
         int32_t   kv_head,
         int32_t   n_kv,
-        float     max_alibi_bias,
         float     kq_scale,
         const llm_build_cb & cb,
         int       il) {

@@ -4964,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

     struct ggml_tensor * cur;
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph,
-            wo, wo_b,
-            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

     return cur;
@@ -5134,7 +5150,7 @@ struct llm_build_context {
                 }

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );

@@ -5149,7 +5165,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5279,6 +5295,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         // shift the entire K-cache if needed
         if (do_rope_shift) {
             llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);

@@ -5327,12 +5347,9 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);

-                // apply ALiBi for 13B model
-                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5456,7 +5473,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5555,7 +5572,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5760,7 +5777,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5822,6 +5839,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -5849,7 +5870,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5950,7 +5971,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             } else {
                 // compute Q and K and RoPE them

@@ -5981,7 +6002,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6057,6 +6078,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         inpL = llm_build_norm(ctx0, inpL, hparams,
                 model.tok_norm,
                 model.tok_norm_b,

@@ -6090,7 +6115,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6150,6 +6175,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;

@@ -6183,7 +6212,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6305,7 +6334,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6420,7 +6449,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6541,7 +6570,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6668,7 +6697,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6771,7 +6800,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
             struct ggml_tensor * sa_out = cur;

@@ -6870,7 +6899,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -6979,7 +7008,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -7097,7 +7126,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -7216,7 +7245,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -7348,7 +7377,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -7579,7 +7608,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             }
         }
     }

+    if (hparams.need_kq_pos) {
+        const int64_t n_kv = kv_self.n;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
+
+        float * data = (float *) lctx.inp_KQ_pos->data;
+
+        for (int i = 0; i < n_kv; ++i) {
+            data[i] = float(lctx.kv_self.cells[i].pos);
+        }
+    }
+
     if (kv_self.has_shift) {
         const int64_t n_ctx = cparams.n_ctx;
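Note that inp_KQ_pos carries only the raw cache positions; the per-head ALiBi slope that multiplies them is derived inside the fused soft-max from max_bias and the head count. A sketch of the slope schedule (this mirrors the ggml CPU implementation this merge targets; treat it as descriptive, not normative):

    // slope for head h (0-based): heads below the largest power of two in n_head
    // use base m0, the remainder interpolate with m1
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias)        / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float slope = h < n_head_log2 ? powf(m0, h + 1)
                                        : powf(m1, 2*(h - n_head_log2) + 1);
    // each attention score then receives pos[i] * slope before the soft-max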
@@ -10596,20 +10637,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_Q4_K;
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q2_K;

@@ -10619,6 +10660,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
             ++qs.i_ffn_down;
         }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+        }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;

@@ -10752,7 +10796,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {

@@ -10767,6 +10811,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ1_S:
             case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;

@@ -10809,6 +10854,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S ;  break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }

@@ -10982,6 +11028,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         if ((new_type == GGML_TYPE_IQ2_XXS ||
              new_type == GGML_TYPE_IQ2_XS  ||
+             new_type == GGML_TYPE_IQ1_S   ||
             (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -11747,7 +11794,7 @@ struct llama_context * llama_new_context_with_model(
         // graph inputs
         {
             ggml_init_params init_params = {
-                /* .mem_size */ ggml_tensor_overhead()*7,
+                /* .mem_size */ ggml_tensor_overhead()*8,
                /* .mem_buffer */ nullptr,
                 /* .no_alloc */ true,
             };

@@ -11757,6 +11804,7 @@ struct llama_context * llama_new_context_with_model(
             ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
             ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
             ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
             ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
             ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
             ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

@@ -11765,6 +11813,7 @@ struct llama_context * llama_new_context_with_model(
             ggml_set_name(ctx->inp_embd,    "inp_embd");
             ggml_set_name(ctx->inp_pos,     "inp_pos");
             ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+            ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
             ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
             ggml_set_name(ctx->inp_mean,    "inp_mean");
             ggml_set_name(ctx->inp_cls,     "inp_cls");
@@ -12787,6 +12836,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     return 0;
 }

+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl.find("[INST]") != std::string::npos) {
+        // llama2 template and its variants
+        // [variant] support system message
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        // [variant] space before + after response
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                is_inside_turn = false;
+            }
+        }
+        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
+LLAMA_API int32_t llama_chat_apply_template(
+    const struct llama_model * model,
+    const char * tmpl,
+    const struct llama_chat_message * chat,
+    size_t n_msg,
+    bool add_ass,
+    char * buf,
+    int32_t length) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
+        GGML_ASSERT(model != nullptr);
+        // load template from model
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
+        if (res < 0) {
+            // worst case: there is no information about template, we will use chatml by default
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+        } else {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
+    }
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+    std::string formatted_chat;
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    strncpy(buf, formatted_chat.c_str(), length);
+    return res;
+}
+
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
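As a concrete trace of the ChatML branch above: for the two-message chat {system: "You are a helpful assistant"}, {user: "Hello"} with add_ass = true, the stream writes produce

    <|im_start|>system
    You are a helpful assistant<|im_end|>
    <|im_start|>user
    Hello<|im_end|>
    <|im_start|>assistant

which is exactly the shape the new test further down asserts against via substring checks.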
llama.h (26 changed lines)

@@ -100,6 +100,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S   = 24, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

@@ -304,6 +305,12 @@ extern "C" {
         int32_t n_eval;
     };

+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);

@@ -700,6 +707,25 @@ extern "C" {
                                   char * buf,
                                int32_t   length);

+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+              const struct llama_model * model,
+                            const char * tmpl,
+       const struct llama_chat_message * chat,
+                                size_t   n_msg,
+                                  bool   add_ass,
+                                  char * buf,
+                               int32_t   length);
+
     //
     // Grammar
     //
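A minimal usage sketch for the new API, following the buffer contract spelled out in the doc comment (the model handle and message array are assumed to exist; the grow-and-retry loop is the pattern the @return note suggests):

    #include <string>
    #include <vector>
    #include "llama.h"

    // hedged sketch: format a conversation with the model's own chat template,
    // growing the buffer when the formatted prompt does not fit
    static std::string format_chat(const llama_model * model,
                                   const llama_chat_message * msgs, size_t n_msg) {
        std::vector<char> buf(1024);
        int32_t n = llama_chat_apply_template(model, nullptr, msgs, n_msg,
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) {
            buf.resize(n); // the return value is the full length, per the comment above
            n = llama_chat_apply_template(model, nullptr, msgs, n_msg,
                                          true, buf.data(), (int32_t) buf.size());
        }
        return n < 0 ? std::string() : std::string(buf.data(), n); // n < 0: unsupported template
    }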
tests/test-chat-template.cpp (new file, 64 lines)

@@ -0,0 +1,64 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <sstream>
+
+#undef NDEBUG
+#include <cassert>
+
+#include "llama.h"
+
+int main(void) {
+    llama_chat_message conversation[] = {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "Who are you"},
+        {"assistant", " I am an assistant "},
+        {"user", "Another question"},
+    };
+    size_t message_count = 6;
+    std::vector<std::string> templates = {
+        // teknium/OpenHermes-2.5-Mistral-7B
+        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+        // mistralai/Mistral-7B-Instruct-v0.2
+        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+        // TheBloke/FusionNet_34Bx2_MoE-AWQ
+        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
+        // bofenghuang/vigogne-2-70b-chat
+        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+    };
+    std::vector<std::string> expected_substr = {
+        "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
+        "[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        "</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
+        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+    };
+    std::vector<char> formatted_chat(1024);
+    int32_t res;
+
+    // test invalid chat template
+    res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
+    assert(res < 0);
+
+    for (size_t i = 0; i < templates.size(); i++) {
+        std::string custom_template = templates[i];
+        std::string substr = expected_substr[i];
+        formatted_chat.resize(1024);
+        res = llama_chat_apply_template(
+            nullptr,
+            custom_template.c_str(),
+            conversation,
+            message_count,
+            true,
+            formatted_chat.data(),
+            formatted_chat.size()
+        );
+        formatted_chat.resize(res);
+        std::string output(formatted_chat.data(), formatted_chat.size());
+        std::cout << output << "\n-------------------------\n";
+        // expect the "formatted_chat" to contain pre-defined strings
+        assert(output.find(substr) != std::string::npos);
+    }
+    return 0;
+}