diff --git a/README-sycl.md b/README-sycl.md index e3a8e726e..dd5bf9dea 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). -Recommend to install to default folder: **/opt/intel/oneapi**. +Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**. Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder. diff --git a/common/common.cpp b/common/common.cpp index 1ed7190f1..8d9983135 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1705,6 +1705,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); @@ -1742,7 +1743,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? 
"true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); @@ -1751,7 +1752,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); @@ -1802,7 +1803,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { if (cs_curr[j] < 0) { continue; } if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j]] = seqs.size(); + const size_t sz = seqs.size(); + seqs[cs_curr[j]] = sz; } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } diff --git a/common/sampling.cpp b/common/sampling.cpp index c198260f3..0cceb77af 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -121,7 +121,7 @@ static void sampler_queue( struct llama_context * ctx_main, const llama_sampling_params & params, llama_token_data_array & cur_p, - size_t & min_keep) { + size_t min_keep) { const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; @@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl( id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { // temperature sampling - size_t min_keep = std::max(1, params.n_probs); + size_t min_keep = std::max(1, params.min_keep); sampler_queue(ctx_main, params, cur_p, min_keep); diff --git a/common/sampling.h b/common/sampling.h index c20faf551..9b635a905 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -22,6 +22,7 @@ enum class llama_sampler_type : char { typedef struct llama_sampling_params { int32_t n_prev = 64; // number of previous tokens to remember int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index e7d2ad592..65bb238a0 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1533,16 +1533,17 @@ int main(int argc, char ** argv) { int n_past = 0; - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); - struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch); // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits); struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1552,8 +1553,8 @@ int main(int argc, char ** argv) { opt_params_lbfgs.lbfgs.n_iter = 16; ggml_opt(ctx0, opt_params_lbfgs, e); // - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, logits); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 55dfd9784..b4b8a38e1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index eab636692..9be7eb56b 100644 
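The `min_keep` field added to `llama_sampling_params` above gives every sampler in `sampler_queue` a lower bound on how many candidate tokens to keep, and `llama_sampling_sample_impl` now clamps it to at least 1 instead of deriving it from `n_probs`. A minimal sketch of how calling code might set it (the include path and the values are illustrative assumptions, not part of the patch):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

#include "sampling.h"  // common/sampling.h as modified above; include path is assumed

// Hypothetical configuration: keep at least 5 candidates per sampler even
// with aggressive top_p/min_p settings. A value of 0 keeps the old behaviour.
static llama_sampling_params make_sampling_params() {
    llama_sampling_params p;
    p.top_p    = 0.50f;
    p.min_p    = 0.10f;
    p.min_keep = 5;
    return p;
}

// Mirrors the clamp in llama_sampling_sample_impl: even with min_keep == 0,
// sampler_queue is asked to keep at least one token.
static size_t effective_min_keep(const llama_sampling_params & p) {
    return std::max(1, p.min_keep);
}

int main() {
    const llama_sampling_params p = make_sampling_params();
    std::printf("effective min_keep: %zu\n", effective_min_keep(p));
    return 0;
}
```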
--- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 4d41e1779..8209dcb64 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -325,14 +325,14 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, params->n_ff); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); + printf("%s: n_vocab: %u\n", __func__, params->n_vocab); + printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: n_embd: %u\n", __func__, params->n_embd); + printf("%s: n_mult: %u\n", __func__, params->n_mult); + printf("%s: n_head: %u\n", __func__, params->n_head); + printf("%s: n_ff: %u\n", __func__, params->n_ff); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_rot: %u\n", __func__, params->n_rot); } static void init_model(struct my_llama_model * model) { @@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) { model->train_tokens = 0; model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); + printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); + printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); // printing the per-layer allocations here so we dont print in the for loop. 
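The `print_params`/`init_model` changes here, like the similar ones above and below, switch to `%u` because the corresponding hparams fields are unsigned. A tiny standalone illustration of why the specifier has to match (the value is made up):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical unsigned value near the top of the 32-bit range.
    const uint32_t n_vocab = 4294967295u;

    // %u matches the unsigned argument; %d would reinterpret the same bits
    // as a negative int and is flagged by -Wformat.
    std::printf("n_vocab: %u\n", n_vocab);  // prints 4294967295
    return 0;
}
```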
- printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); + printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); + printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 2f7be8a13..08413f57e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -7,8 +7,6 @@ #include #include -static const size_t tensor_alignment = 32; - struct lora_info { std::string filename; float scale; diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py index 2a4cb65bc..6a977f031 100755 --- a/examples/json-schema-to-grammar.py +++ b/examples/json-schema-to-grammar.py @@ -87,7 +87,21 @@ class SchemaConverter: elif schema_type == 'array' and 'items' in schema: # TODO `prefixItems` keyword item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') - rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' + list_item_operator = f'("," space {item_rule_name})' + successive_items = "" + min_items = schema.get("minItems", 0) + if min_items > 0: + first_item = f"({item_rule_name})" + successive_items = list_item_operator * (min_items - 1) + min_items -= 1 + else: + first_item = f"({item_rule_name})?" 
+ max_items = schema.get("maxItems") + if max_items is not None and max_items > min_items: + successive_items += (list_item_operator + "?") * (max_items - min_items - 1) + else: + successive_items += list_item_operator + "*" + rule = f'"[" space {first_item} {successive_items} "]" space' return self._add_rule(rule_name, rule) else: diff --git a/examples/llava/README.md b/examples/llava/README.md index 57eb42932..e42db6e5a 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./convert.py ../llava-v1.5-7b +python ./convert.py ../llava-v1.5-7b --skip-unknown ``` Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 98d512f67..ef9e4ba7a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 KQ = ggml_soft_max_inplace(ctx0, KQ); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); } // attention output diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py index 0a61efdfe..4f2da3bee 100644 --- a/examples/llava/llava-surgery.py +++ b/examples/llava/llava-surgery.py @@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project projector = {name: checkpoint[name].float() for name in mm_tensors} torch.save(projector, f"{args.model}/llava.projector") -# remove these tensors from the checkpoint and save it again -for name in mm_tensors: - del checkpoint[name] - # BakLLaVA models contain CLIP tensors in it clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] if len(clip_tensors) > 0: clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} torch.save(clip, f"{args.model}/llava.clip") - # remove these tensors - for name in clip_tensors: - del checkpoint[name] # added tokens should be removed to be able to convert Mistral models if os.path.exists(f"{args.model}/added_tokens.json"): @@ -39,7 +32,6 @@ if len(clip_tensors) > 0: f.write("{}\n") -torch.save(checkpoint, path) print("Done!") print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 3bf95e7c8..194743cf4 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -310,7 +310,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens } static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each 
chunk before eval @@ -448,7 +448,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par return perplexity_v2(ctx, params); } - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval @@ -1624,7 +1624,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { uint32_t n_ctx; in.read((char *)&n_ctx, sizeof(n_ctx)); if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n", + fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 2af4a0fb8..ca129e88c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -24,6 +24,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, + { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, @@ -288,9 +289,10 @@ int main(int argc, char ** argv) { } } - if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) { + if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) { fprintf(stderr, "\n===============================================================================================\n"); - fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); + fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); fprintf(stderr, "===============================================================================================\n\n\n"); return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index 249368749..f6b9c7402 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` +- `-n, --n-predict`: Set the maximum tokens to predict (default: -1) +- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. 
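The two options above add a server-side cap on predicted tokens and a switch for the slots state monitoring endpoint; the `/health` and `/slots` routes they relate to are documented in the API section below. As a smoke test, a client could poll them with cpp-httplib, the HTTP library the server example already uses. This is only a sketch and assumes a server is already running on the default `localhost:8080`:

```cpp
#include <cstdio>
#include "httplib.h"  // cpp-httplib, vendored with the server example

int main() {
    httplib::Client cli("localhost", 8080);

    // 200 with slot counts when a slot is free; 503 if `fail_on_no_slot`
    // is set and all slots are busy (see the /health docs below).
    if (auto res = cli.Get("/health?fail_on_no_slot=1")) {
        std::printf("/health -> %d: %s\n", res->status, res->body.c_str());
    }

    // Per-slot state as JSON; the route is not registered when the server
    // was started with --slots-endpoint-disable, so a 404 is expected then.
    if (auto res = cli.Get("/slots")) {
        std::printf("/slots  -> %d: %s\n", res->status, res->body.c_str());
    }
    return 0;
}
```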
## Build @@ -132,9 +134,11 @@ node index.js ## API Endpoints - **GET** `/health`: Returns the current state of the server: - - `{"status": "loading model"}` if the model is still being loaded. - - `{"status": "error"}` if the model failed to load. - - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 503 -> `{"status": "loading model"}` if the model is still being loaded. + - 500 -> `{"status": "error"}` if the model failed to load. + - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. @@ -196,6 +200,8 @@ node index.js `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) + `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0) + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1) @@ -379,6 +385,69 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` +- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. 
+ +### Result JSON + +```json +[ + { + "dynatemp_exponent": 1.0, + "dynatemp_range": 0.0, + "frequency_penalty": 0.0, + "grammar": "", + "id": 0, + "ignore_eos": false, + "logit_bias": [], + "min_p": 0.05000000074505806, + "mirostat": 0, + "mirostat_eta": 0.10000000149011612, + "mirostat_tau": 5.0, + "model": "llama-2-7b-32k-instruct.Q2_K.gguf", + "n_ctx": 2048, + "n_keep": 0, + "n_predict": 100000, + "n_probs": 0, + "next_token": { + "has_next_token": true, + "n_remain": -1, + "num_tokens_predicted": 0, + "stopped_eos": false, + "stopped_limit": false, + "stopped_word": false, + "stopping_word": "" + }, + "penalize_nl": true, + "penalty_prompt_tokens": [], + "presence_penalty": 0.0, + "prompt": "Say hello to llama.cpp", + "repeat_last_n": 64, + "repeat_penalty": 1.100000023841858, + "samplers": [ + "top_k", + "tfs_z", + "typical_p", + "top_p", + "min_p", + "temperature" + ], + "seed": 42, + "state": 1, + "stop": [ + "\n" + ], + "stream": false, + "task_id": 0, + "temperature": 0.0, + "tfs_z": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "typical_p": 1.0, + "use_penalty_prompt_tokens": false + } +] +``` + ## More examples ### Change system prompt on runtime diff --git a/examples/server/public/index.html b/examples/server/public/index.html index b059c75f2..84038ddce 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -234,6 +234,7 @@ mirostat_eta: 0.1, // learning rate grammar: '', n_probs: 0, // no completion_probabilities, + min_keep: 0, // min probs from each sampler, image_data: [], cache_prompt: true, api_key: '' @@ -791,6 +792,9 @@
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+            <div>
+              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+            </div>
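The new web UI field above exposes the same `min_keep` setting that the README documents for `/completion`. Before the server implementation below, here is a sketch of what such a request could look like from a cpp-httplib client; the endpoint and field names come from the docs above, while the values are purely illustrative:

```cpp
#include <cstdio>
#include <string>
#include "httplib.h"  // cpp-httplib, as used by the server example

int main() {
    httplib::Client cli("localhost", 8080);

    // Hypothetical request body showing the new `min_keep` field next to the
    // usual sampling options.
    const std::string body = R"({
        "prompt": "Say hello to llama.cpp",
        "n_predict": 32,
        "top_p": 0.5,
        "min_p": 0.1,
        "min_keep": 5
    })";

    if (auto res = cli.Post("/completion", body, "application/json")) {
        std::printf("%d: %s\n", res->status, res->body.c_str());
    }
    return 0;
}
```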
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 67975170c..19697f2b8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,6 +29,7 @@ #include #include #include +#include using json = nlohmann::json; @@ -41,6 +42,7 @@ struct server_params int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; + bool slots_endpoint = true; }; bool server_verbose = false; @@ -159,6 +161,7 @@ struct llama_client_slot int32_t n_decoded = 0; int32_t n_remaining = -1; int32_t i_batch = -1; + int32_t n_predict = -1; int32_t num_prompt_tokens = 0; int32_t num_prompt_tokens_processed = 0; @@ -410,6 +413,7 @@ struct llama_server_context slot.id = i; slot.n_ctx = n_ctx_slot; + slot.n_predict = params.n_predict; LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); @@ -545,6 +549,16 @@ struct llama_server_context slot->params.seed = json_value(data, "seed", default_params.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + + if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) { + // Might be better to reject the request with a 400 ? + LOG_WARNING("Max tokens to predict exceeds server configuration", { + {"params.n_predict", slot->params.n_predict}, + {"slot.n_predict", slot->n_predict}, + }); + slot->params.n_predict = slot->n_predict; + } // infill if (data.count("input_prefix") != 0) @@ -1053,6 +1067,7 @@ struct llama_server_context return json { {"n_ctx", slot.n_ctx}, + {"n_predict", slot.n_predict}, {"model", params.model_alias}, {"seed", slot.params.seed}, {"temperature", slot.sparams.temp}, @@ -1080,6 +1095,7 @@ struct llama_server_context {"stream", slot.params.stream}, {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, + {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, {"samplers", samplers_sequence} }; @@ -1914,14 +1930,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); + printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); printf("\n"); + printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf(" --chat-template FORMAT_NAME"); - printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str()); + printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str()); printf("\n"); } @@ -2361,6 +2379,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, log_set_target(stdout); LOG_INFO("logging to file is disabled.", {}); } + else if (arg == "--slots-endpoint-disable") + { + sparams.slots_endpoint = false; + } else if (arg == "--chat-template") { if (++i >= argc) @@ -2512,6 +2534,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } } +std::function shutdown_handler; +inline void signal_handler(int signal) { shutdown_handler(signal); } + int main(int argc, char **argv) { #if SERVER_VERBOSE != 1 @@ -2558,13 +2583,40 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) { server_state current_state = state.load(); switch(current_state) { - case SERVER_STATE_READY: - res.set_content(R"({"status": "ok"})", "application/json"); - res.status = 200; // HTTP OK + case SERVER_STATE_READY: { + int available_slots = 0; + int processing_slots = 0; + for (llama_client_slot &slot: llama.slots) { + if (slot.available()) { + available_slots++; + } else { + processing_slots++; + } + } + if (available_slots > 0) { + json health = { + {"status", "ok"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + res.status = 200; // HTTP OK + } else { + json health = { + {"status", "no slot available"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + if (req.has_param("fail_on_no_slot")) { + res.status = 503; // HTTP Service Unavailable + } else { + res.status = 200; // HTTP OK + } + } break; + } case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable @@ -2576,6 +2628,32 @@ int main(int argc, char **argv) } }); + if (sparams.slots_endpoint) { + svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) { + json slots; + for (llama_client_slot & slot : llama.slots) { + json slot_data = llama.get_formated_generation(slot); + slot_data["id"] = slot.id; + slot_data["task_id"] = slot.task_id; + slot_data["state"] = slot.state; + slot_data["prompt"] = slot.prompt; + slot_data["next_token"] = { + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + }; + + slots.push_back(slot_data); + } + res.set_content(slots.dump(), "application/json"); + res.status = 200; // 
HTTP OK + }); + } + svr.set_logger(log_server_request); svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) @@ -3129,8 +3207,25 @@ int main(int argc, char **argv) std::placeholders::_2, std::placeholders::_3 )); - llama.queue_tasks.start_loop(); + shutdown_handler = [&](int) { + llama.queue_tasks.terminate(); + }; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + llama.queue_tasks.start_loop(); + svr.stop(); t.join(); llama_backend_free(); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 548548962..0ee670dba 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector messages) struct llama_server_queue { int id = 0; std::mutex mutex_tasks; + bool running; // queues std::vector queue_tasks; std::vector queue_tasks_deferred; @@ -278,9 +279,18 @@ struct llama_server_queue { queue_tasks_deferred.clear(); } - // Start the main loop. This call is blocking - [[noreturn]] + // end the start_loop routine + void terminate() { + { + std::unique_lock lock(mutex_tasks); + running = false; + } + condition_tasks.notify_all(); + } + + // Start the main loop. void start_loop() { + running = true; while (true) { // new task arrived LOG_VERBOSE("have new task", {}); @@ -324,8 +334,12 @@ struct llama_server_queue { { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { + if (!running) { + LOG_VERBOSE("ending start_loop", {}); + return; + } condition_tasks.wait(lock, [&]{ - return !queue_tasks.empty(); + return (!queue_tasks.empty() || !running); }); } } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bfdf124d7..e78ab185d 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, params->n_ff); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); + printf("%s: n_vocab: %u\n", __func__, params->n_vocab); + printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: n_embd: %u\n", __func__, params->n_embd); + printf("%s: n_head: %u\n", __func__, params->n_head); + printf("%s: n_ff: %u\n", __func__, params->n_ff); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_rot: %u\n", __func__, params->n_rot); } static void set_param_model(struct my_llama_model * model) { diff --git a/ggml-alloc.c b/ggml-alloc.c index c28c37c4f..e675306c8 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -377,6 
+377,9 @@ struct ggml_gallocr { struct node_alloc * node_allocs; // [n_nodes] int n_nodes; + + struct tensor_alloc * leaf_allocs; // [n_leafs] + int n_leafs; }; ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { @@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { free(galloc->buffers); free(galloc->buf_tallocs); free(galloc->node_allocs); + free(galloc->leaf_allocs); free(galloc); } @@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor for (int i = 0; i < GGML_MAX_SRC; i++) { struct ggml_tensor * parent = node->src[i]; if (parent == NULL) { - break; + continue; } // if the node's data is external, then we cannot re-use it @@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); - // allocate all graph inputs first to avoid overwriting them - for (int i = 0; i < graph->n_nodes; i++) { - if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) { - ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (graph->nodes[i]->src[j] == NULL) { - break; - } - if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) { - ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i)); - } - } - } - // count number of children and views + // allocate all graph inputs and leafs first to avoid overwriting them for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - ggml_gallocr_hash_get(galloc, parent)->n_children += 1; + if (node->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); } - } + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + + ggml_gallocr_hash_get(galloc, src)->n_children += 1; + + // allocate explicit inputs and leafs + if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) { + ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); + } + } + } + + // allocate the remaining leafs that are unused on the graph + // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); + + if (hn->n_children == 0) { + assert(!hn->allocated); + // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer + ggml_gallocr_allocate_node(galloc, leaf, 0); + } + } // allocate tensors for (int i = 0; i < graph->n_nodes; i++) { @@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * parent = node->src[j]; if (parent == NULL) { - break; + continue; } ggml_gallocr_allocate_node(galloc, parent, buffer_id); } @@ 
-598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * parent = node->src[j]; if (parent == NULL) { - break; + continue; } AT_PRINTF("%s", parent->name); if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { @@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * parent = node->src[j]; if (parent == NULL) { - break; + continue; } struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); p_hn->n_children -= 1; @@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } } } + if (galloc->n_leafs < graph->n_leafs) { + free(galloc->leaf_allocs); + galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs); + GGML_ASSERT(galloc->leaf_allocs != NULL); + } + galloc->n_leafs = graph->n_leafs; + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); + galloc->leaf_allocs[i].offset = hn->offset; + galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); + } // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { @@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) { - assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) { + assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); if (node->view_src != NULL) { if (node->buffer == NULL) { @@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * // this tensor was allocated without ggml-backend return; } - ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node); + ggml_backend_view_init(galloc->buffers[buffer_id], node); } } else { if (node->data == NULL) { assert(tensor_alloc->offset != SIZE_MAX); - assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); - void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]); + assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); + void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); void * addr = (char *)base + tensor_alloc->offset; - ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr); + ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr); } else { if (node->buffer == NULL) { // this tensor was allocated without ggml-backend return; } - -#ifndef NDEBUG - size_t offset = - (char *)node->data - - (char *)ggml_backend_buffer_get_base(node->buffer); - size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node); - assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset); - assert(tensor_alloc->offset == SIZE_MAX || size <= 
tensor_alloc->size_max); -#endif } } } @@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph return true; } + if (galloc->n_leafs != graph->n_leafs) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of leafs\n", __func__); +#endif + return true; + } + for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; @@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { #ifndef NDEBUG @@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) } // allocate the graph tensors from the previous assignments + // nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } - ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]); + ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); } - ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst); + ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + } + // leafs + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i]; + ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc); } return true; diff --git a/ggml-backend.c b/ggml-backend.c index 87eea8440..5076d9e5e 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -756,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_CPY: - return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float + return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; default: @@ -1006,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg } } GGML_ASSERT(false && "tensor buffer type not supported by any backend"); + return -1; // silence warning } #if 0 @@ -1040,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { - break; + continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer); @@ -1087,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } ggml_backend_t src_backend = tensor_backend(src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, @@ -1143,7 +1144,7 @@ static void 
ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } if (tensor_backend_id(src) == -1) { tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); @@ -1255,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } int src_backend_id = tensor_backend_id(src); if (src_backend_id == -1) { @@ -1314,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now @@ -1361,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { - break; + continue; } ggml_backend_t src_backend = tensor_backend(src); if (src_backend != tensor_backend /* && src_backend != NULL */) { @@ -1667,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, for (int i = 0; i < GGML_MAX_SRC; i++) { struct ggml_tensor * s = src->src[i]; if (s == NULL) { - break; + continue; } dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); } @@ -1696,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te for (int i = 0; i < GGML_MAX_SRC; i++) { struct ggml_tensor * s = src->src[i]; if (s == NULL) { - break; + continue; } graph_copy_init_tensor(hash_set, node_copies, node_init, s); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7c99fc794..6c08ff032 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -54,6 +54,8 @@ #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord @@ -517,6 +519,15 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#define QR1_S 8 +#define QI1_S (QK_K / (4*QR1_S)) +typedef struct { + half d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -643,18 +654,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { return a; } -static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); - } - return a; -#else - (void) a; - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -} +//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { +//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +//#pragma unroll +// for (int mask = 16; mask > 0; mask >>= 1) { +// a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); +// } +// return a; +//#else +// (void) a; +// NO_DEVICE_CODE; +//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +//} static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll @@ -664,18 +675,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { return x; } -static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); - } - return x; -#else - (void) x; - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -} +//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { +//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +//#pragma unroll +// for (int mask = 16; mask > 0; mask >>= 1) { +// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); +// } +// return x; +//#else +// (void) x; +// NO_DEVICE_CODE; +//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +//} static __device__ __forceinline__ float op_repeat(const float a, const float b) { return b; @@ -1682,6 +1693,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +static const __device__ uint64_t iq1s_grid[512] = { + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, + 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 
0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 
0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 
0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, +}; + static const __device__ uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -1824,6 +1966,29 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds } +template +static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const int i8 = 4*ib+il; + uint8_t h = x[i].scales[i8/2] >> 4*(i8%2); + const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5))); + const float d = (float)x[i].d * (2*(h & 7) + 1); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j]; +#else + assert(false); +#endif + +} + + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * 
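
The IQ1_S decode used by dequantize_block_iq1_s and vec_dot_iq1_s_q8_1 above always handles 8 weights at a time: the 4-bit scale field contributes both the odd scale 2*(h & 7) + 1 and, via bit 3, the 9th bit of the iq1s_grid index. A minimal host-side sketch of that decode, assuming QK_K == 256 and the grid/block layout added in this patch (block_iq1_s_ref and dequantize_iq1_s_ref are illustrative names, not part of the change):

#include <cstdint>

struct block_iq1_s_ref {            // mirrors block_iq1_s for QK_K == 256
    float    d;                     // stored as half on the device; float here for clarity
    uint8_t  qs[256/8];             // one grid index (low 8 bits) per group of 8 weights
    uint8_t  scales[256/16];        // two 4-bit fields per byte: 3 scale bits + 1 index bit
};

static void dequantize_iq1_s_ref(const block_iq1_s_ref & b, const uint64_t * grid, float * y) {
    for (int i8 = 0; i8 < 32; ++i8) {                     // 32 groups of 8 weights
        const uint8_t h = b.scales[i8/2] >> 4*(i8%2);     // 4-bit field of this group
        const int8_t * row = (const int8_t *) &grid[b.qs[i8] | ((h & 8) << 5)];
        const float dl = b.d * (2*(h & 7) + 1);
        for (int j = 0; j < 8; ++j) {
            y[8*i8 + j] = dl * row[j];
        }
    }
}
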
__restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -4479,10 +4644,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else + (void) ksigns64; assert(false); return 0.f; #endif #else + (void) ksigns64; assert(false); return 0.f; #endif @@ -4523,6 +4690,49 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( #endif } +static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + + const int ib32 = iqs; + int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; + const uint8_t h1 = bq1->scales[2*ib32+0]; + const uint8_t h2 = bq1->scales[2*ib32+1]; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const int * q8 = (const int *)bq8_1[ib32].qs; + const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5))); + const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1))); + const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5))); + const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1))); + for (int j = 0; j < 2; ++j) { + sumi1 = __dp4a(q8[j+0], grid1[j], sumi1); + sumi2 = __dp4a(q8[j+2], grid2[j], sumi2); + sumi3 = __dp4a(q8[j+4], grid3[j], sumi3); + sumi4 = __dp4a(q8[j+6], grid4[j], sumi4); + } +#else + const int8_t * q8 = bq8_1[ib32].qs; + const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5))); + const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1))); + const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5))); + const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j+ 0] * grid1[j]; + sumi2 += q8[j+ 8] * grid2[j]; + sumi3 += q8[j+16] * grid3[j]; + sumi4 += q8[j+24] * grid4[j]; + } +#endif + const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds); + return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) + + sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1)); +#else + assert(false); + return 0.f; +#endif +} + template static __device__ __forceinline__ void mul_mat_q( @@ -5957,149 +6167,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; } -template -static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX - const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template; - const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2; - - const int tid = threadIdx.x; - const int rowx = blockIdx.x; - const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension - - const int block_size = block_size_template == 0 ? 
blockDim.x : block_size_template; - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - extern __shared__ half data_soft_max_f16[]; - half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication - // (shared memory) buffer to cache values between iterations: - half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data); - // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead - // in that case col_smem == col_data must be enforced to avoid race conditions - - half2 max_val = make_half2(-INFINITY, -INFINITY); - -#pragma unroll - for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { - const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id; - const int col_smem = vals_smem ? col0 + tid : col_data; - - const int ix = rowx*ncols_data + col_data; - const int iy = rowy*ncols_data + col_data; - - half2 val; - if (need_check && col_data + 0 >= ncols_data) { - val.x = -INFINITY; - } else { - val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f); - } - if (need_check && col_data + WARP_SIZE >= ncols_data) { - val.y = -INFINITY; - } else { - val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f); - } - if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) { - vals[col_smem] = val; - } - max_val = __hmax2(max_val, val); - } - - // find the max value in the block - max_val = warp_reduce_max(max_val); - if (block_size > WARP_SIZE) { - if (warp_id == 0) { - buf_iw[lane_id] = -INFINITY; - } - __syncthreads(); - - if (lane_id == 0) { - buf_iw[warp_id] = __hmax(max_val.x, max_val.y); - } - __syncthreads(); - - max_val = __half2half2(buf_iw[lane_id]); - max_val = warp_reduce_max(max_val); - } else { - max_val = __half2half2(__hmax(max_val.x, max_val.y)); - } - - half2 tmp = make_half2(0.0f, 0.0f); // partial sums - -#pragma unroll - for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { - const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id; - - if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) { - break; - } - - const half2 val = h2exp(vals[col_smem] - max_val); - - tmp += val; - vals[col_smem] = val; - } - - // find the sum of exps in the block - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - if (warp_id == 0) { - buf_iw[lane_id] = 0.0f; - } - __syncthreads(); - - if (lane_id == 0) { - buf_iw[warp_id] = tmp.x + tmp.y; - } - __syncthreads(); - - tmp = __half2half2(buf_iw[lane_id]); - tmp = warp_reduce_sum(tmp); - } else { - tmp = __half2half2(tmp.x + tmp.y); - } - - const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp; - -#pragma unroll - for (int col0 = 0; col0 < ncols_smem; col0 += block_size) { - const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id; - const int col_smem = vals_smem ? 
col0 + tid : col_data; - - const int idst = rowx*ncols_data + col_data; - const half2 result = vals[col_smem] * inv_sum; - - if (need_check && col_data + 0 >= ncols_data) { - return; - } - dst[idst] = result.x; - - if (need_check && col_data + WARP_SIZE >= ncols_data) { - return; - } - - dst[idst + WARP_SIZE] = result.y; - } -#else - (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -} - template -static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { +static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int tid = threadIdx.x; const int rowx = blockIdx.x; - const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; const int warp_id = threadIdx.x / WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE; + float slope = 0.0f; + + // ALiBi + if (max_bias > 0.0f) { + const int h = rowx/nrows_y; // head index + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = powf(base, exp); + } + extern __shared__ float data_soft_max_f32[]; float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication // shared memory buffer to cache values between iterations: @@ -6118,7 +6210,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds const int ix = rowx*ncols + col; const int iy = rowy*ncols + col; - const float val = x[ix]*scale + (y ? y[iy] : 0.0f); + const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? 
slope*pos[col] : 0.0f); + vals[col] = val; max_val = max(max_val, val); } @@ -6679,6 +6772,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq3_xxs<<>>(vx, y); } +template +static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq1_s<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6718,6 +6817,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq2_xs_cuda; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_cuda; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6753,6 +6854,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq2_xs_cuda; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_cuda; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -7590,89 +7693,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); } -static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) { - int nth = WARP_SIZE; - while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; - const dim3 block_dims(nth, 1, 1); - const dim3 block_nums(nrows_x, 1, 1); - const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half); - static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); - if (shmem <= g_device_caps[g_main_device].smpb) { - switch (ncols_x) { - case 32: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 64: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 128: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 256: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 512: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 1024: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 2048: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - case 4096: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - default: - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - break; - } - } else { - const size_t shmem_low = WARP_SIZE*sizeof(half); - soft_max_f16<<>>(x, y, dst, ncols_x, nrows_y, scale); - } -} - -static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) { +static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { int nth = WARP_SIZE; while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; const dim3 block_dims(nth, 1, 1); const dim3 block_nums(nrows_x, 1, 1); const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); + + const uint32_t n_head_kv = 
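
Per element the updated kernel now applies x*scale + mask + slope*pos before the usual max-subtract/exp/normalize steps. A single-row CPU sketch of that math, for reference only (not part of the patch; mask and pos may be null exactly as in the kernel):

#include <cmath>

static void soft_max_row_ref(const float * x, const float * mask, const float * pos,
                             float * dst, int ncols, float scale, float slope) {
    float max_val = -INFINITY;
    for (int i = 0; i < ncols; ++i) {
        const float v = x[i]*scale + (mask ? mask[i] : 0.0f) + (pos ? slope*pos[i] : 0.0f);
        dst[i]  = v;
        max_val = fmaxf(max_val, v);
    }
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        dst[i] = expf(dst[i] - max_val);
        sum   += dst[i];
    }
    for (int i = 0; i < ncols; ++i) {
        dst[i] /= sum;
    }
}
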
nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + if (shmem < g_device_caps[g_main_device].smpb) { switch (ncols_x) { case 32: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 64: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 128: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 256: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 512: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 1024: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 2048: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; case 4096: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; default: - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); break; } } else { const size_t shmem_low = WARP_SIZE*sizeof(float); - soft_max_f32<<>>(x, y, dst, ncols_x, nrows_y, scale); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); } } @@ -8523,6 +8590,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8546,6 +8614,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8643,6 +8712,10 @@ static void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; default: GGML_ASSERT(false); break; @@ -9082,30 +9155,36 @@ static void ggml_cuda_op_soft_max( GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional - const int64_t ne00 = src0->ne[0]; + const int64_t ne00 = src0->ne[0]; const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src1 ? 
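
The n_head_log2/m0/m1 values computed above implement the usual ALiBi slope schedule: heads below the nearest power of two get successive powers of m0, the remaining heads get odd powers of m1. A small standalone sketch with made-up example values (not taken from the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const int   n_head   = 12;     // example only
    const float max_bias = 8.0f;   // example only

    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    for (int h = 0; h < n_head; ++h) {
        const float base = h < (int) n_head_log2 ? m0 : m1;
        const int   exp  = h < (int) n_head_log2 ? h + 1 : 2*(h - (int) n_head_log2) + 1;
        printf("head %2d: slope = %g\n", h, powf(base, exp));
    }
    return 0;
}
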
ggml_nrows(src1) : 1; + const int64_t nrows_y = src0->ne[1]; - float scale = 1.0f; - memcpy(&scale, dst->op_params, sizeof(float)); + float scale = 1.0f; + float max_bias = 0.0f; -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX -#ifdef GGML_CUDA_F16 - const bool use_f16_soft_max = true; -#else - const bool use_f16_soft_max = false; -#endif // GGML_CUDA_F16 -#else - const bool use_f16_soft_max = false; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); - if (use_f16_soft_max) { - soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); - } else { - soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); + // positions tensor + float * src2_dd = nullptr; + cuda_pool_alloc src2_f; + + ggml_tensor * src2 = dst->src[2]; + const bool use_src2 = src2 != nullptr; + + if (use_src2) { + const bool src2_on_device = src2->backend == GGML_BACKEND_GPU; + + if (src2_on_device) { + ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; + src2_dd = (float *) src2_extra->data_device[g_main_device]; + } else { + src2_dd = src2_f.alloc(ggml_nelements(src2)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream)); + } } - (void) dst; + soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); } static void ggml_cuda_op_scale( @@ -9240,9 +9319,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); if (can_access_peer) { if (enable_peer_access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); + if (err != cudaErrorPeerAccessAlreadyEnabled) { + CUDA_CHECK(err); + } } else { - CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + cudaError_t err = cudaDeviceDisablePeerAccess(id_other); + if (err != cudaErrorPeerAccessNotEnabled) { + CUDA_CHECK(err); + } } } } @@ -10920,10 +11005,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe UNUSED(buffer); } -// unused at the moment -//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { -// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; -//} +static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; + UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds +} GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -11311,7 +11396,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || 
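
ggml_cuda_op_soft_max now expects two floats packed back to back in dst->op_params (scale first, then max_bias), mirroring the Metal path below. A tiny sketch of that convention with an illustrative stand-in struct (fake_tensor and the helpers are not real ggml API):

#include <cstdint>
#include <cstring>

struct fake_tensor { int32_t op_params[16]; };   // stand-in for ggml_tensor::op_params

static void set_soft_max_params(fake_tensor * dst, float scale, float max_bias) {
    memcpy((float *) dst->op_params + 0, &scale,    sizeof(float));
    memcpy((float *) dst->op_params + 1, &max_bias, sizeof(float));
}

static void get_soft_max_params(const fake_tensor * dst, float * scale, float * max_bias) {
    memcpy(scale,    (const float *) dst->op_params + 0, sizeof(float));
    memcpy(max_bias, (const float *) dst->op_params + 1, sizeof(float));
}
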
ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } } @@ -11359,7 +11444,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) { + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-metal.m b/ggml-metal.m index 50e8661f8..c5c7fb386 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -61,6 +61,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -83,6 +84,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -101,6 +103,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -116,6 +119,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -131,6 +135,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ALIBI_F32, @@ -176,7 +181,7 @@ struct ggml_metal_context { // MSL code // TODO: move the contents here when ready // for now it is easier to work in a separate file -//static NSString * const msl_library_source = @"see metal.metal"; +// static NSString * const msl_library_source = @"see metal.metal"; // Here to assist with NSBundle Path Hack @interface GGMLMetalClass : NSObject @@ -272,6 +277,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { return NULL; } } else { +#if GGML_METAL_EMBED_LIBRARY + GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__); + + extern const char ggml_metallib_start[]; + extern const char ggml_metallib_end[]; + + NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; +#else GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * sourcePath; @@ -294,6 +307,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } +#endif @autoreleasepool { // dictionary of preprocessor macros @@ -433,6 +447,7 @@ static 
struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -455,6 +470,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); @@ -473,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -488,6 +505,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -503,6 +521,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, 
ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); @@ -728,6 +747,7 @@ static bool ggml_metal_graph_compute( size_t offs_src0 = 0; size_t offs_src1 = 0; + size_t offs_src2 = 0; size_t offs_dst = 0; id command_buffer = command_buffers[cb_idx]; @@ -746,6 +766,7 @@ static bool ggml_metal_graph_compute( struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * src2 = gf->nodes[i]->src[2]; struct ggml_tensor * dst = gf->nodes[i]; switch (dst->op) { @@ -807,6 +828,7 @@ static bool ggml_metal_graph_compute( id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; + id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; id id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil; //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); @@ -1188,7 +1210,16 @@ static bool ggml_metal_graph_compute( pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; } - const float scale = ((float *) dst->op_params)[0]; + const float scale = ((float *) dst->op_params)[0]; + const float max_bias = ((float *) dst->op_params)[1]; + + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + const uint32_t n_head_kv = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1197,11 +1228,20 @@ static bool ggml_metal_graph_compute( } else { [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; + if (id_src2) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:7]; + [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:9]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:10]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; @@ -1297,6 +1337,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: 
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1431,6 +1472,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1465,7 +1512,7 @@ static bool ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { + src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -1514,8 +1561,6 @@ static bool ggml_metal_graph_compute( // max size of the src1ids array in the kernel stack GGML_ASSERT(ne11 <= 512); - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; - const int64_t ne20 = src2 ? src2->ne[0] : 0; const int64_t ne21 = src2 ? src2->ne[1] : 0; const int64_t ne22 = src2 ? src2->ne[2] : 0; @@ -1573,6 +1618,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } @@ -1710,6 +1756,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1760,7 +1812,7 @@ static bool ggml_metal_graph_compute( if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { + src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { @@ -1814,6 +1866,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index efed6ad46..f0d77d446 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -351,12 +351,17 @@ kernel void kernel_sum_rows( kernel void kernel_soft_max( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant float & scale, - threadgroup float * buf [[threadgroup(0)]], + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], @@ -368,13 +373,26 @@ kernel void kernel_soft_max( device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; + device const float * ppos = src2 != src0 ? src2 : nullptr; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + float slope = 0.0f; + + // ALiBi + if (max_bias > 0.0f) { + const int64_t h = i02; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + // parallel max float lmax = -INFINITY; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)); + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); } // find the max value in the block @@ -399,7 +417,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); lsum += exp_psrc0; pdst[i00] = exp_psrc0; } @@ -437,12 +455,17 @@ kernel void kernel_soft_max( kernel void kernel_soft_max_4( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant float & scale, - threadgroup float * buf [[threadgroup(0)]], + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, + threadgroup float * buf [[threadgroup(0)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]], @@ -454,13 +477,25 @@ kernel void kernel_soft_max_4( device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr; device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + float slope = 0.0f; + + if (max_bias > 0.0f) { + const int64_t h = i02; + + const float base = h < n_head_log2 ? 
m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + // parallel max float4 lmax4 = -INFINITY; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -486,7 +521,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } @@ -2490,6 +2525,13 @@ typedef struct { } block_iq3_xxs; // 98 bytes / block for QK_K = 256, so 3.0625 bpw +typedef struct { + half d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; + + //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -3747,6 +3789,137 @@ constexpr constant static uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +#define NGRID_IQ1S 512 +constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = { + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, + 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 
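
The block_iq1_s layout declared above stores QK_K weights in one half scale plus QK_K/8 grid-index bytes plus QK_K/16 scale bytes, i.e. 50 bytes per 256 weights, roughly 1.5625 bits per weight. A compile-time sketch of that arithmetic (block_iq1_s_packed is an illustrative stand-in, with uint16_t in place of half):

#include <cstdint>

constexpr int QK_K_ref = 256;

struct block_iq1_s_packed {
    uint16_t d;                      // fp16 super-block scale
    uint8_t  qs[QK_K_ref/8];         // low 8 bits of each grid index
    uint8_t  scales[QK_K_ref/16];    // two 4-bit fields per byte
};

static_assert(sizeof(block_iq1_s_packed) == 2 + QK_K_ref/8 + QK_K_ref/16,
              "50 bytes per 256 weights, ~1.5625 bits per weight");
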
0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 
0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 
0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, +}; constexpr constant static uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, @@ -3854,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl( y4 += 32 * 32; } #else - // TODO + (void) x; + (void) y; + (void) yl; + (void) nb32; #endif for (int row = 0; row < N_DST; ++row) { @@ -3997,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl( y4 += 32 * 32; } #else - // TODO + (void) x; + (void) y; + (void) yl; + (void) nb32; #endif for (int row = 0; row < N_DST; ++row) { @@ -4133,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl( y4 += 32 * 32; } #else - // TODO + (void) x; + (void) y; + (void) yl; + (void) nb32; #endif for (int row = 0; row < N_DST; ++row) { @@ -4173,6 +4355,126 @@ kernel void kernel_mul_mv_iq3_xxs_f32( kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq1_s_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[16]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + +#if QK_K == 256 + const int ix = tiisg/2; + const int il = tiisg%2; + + device const float * y4 = y + 32 * ix + 16 * il; + + for (int ib32 = ix; ib32 < nb32; ib32 += 16) { + + for (int i = 0; i < 16; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const 
int ib = ib32 % (QK_K / 32); + + device const block_iq1_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib + 2 * il; + device const uint8_t * sc = xr->scales + 2 * ib + il; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + + float2 sum = {0}; + for (int j = 0; j < 8; ++j) { + sum[0] += yl[j+ 0] * grid1[j]; + sum[1] += yl[j+ 8] * grid2[j]; + } + sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1)); + + dh += nb*sizeof(block_iq1_s)/2; + qs += nb*sizeof(block_iq1_s); + sc += nb*sizeof(block_iq1_s); + } + + y4 += 16 * 32; + } +#else + (void) x; + (void) y; + (void) yl; + (void) nb32; +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + +[[host_name("kernel_mul_mv_iq1_s_f32")]] +kernel void kernel_mul_mv_iq1_s_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + //============================= templates and their specializations ============================= @@ -4369,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg const float dl = d * sc[0]; const float ml = min * sc[1]; #else + (void) get_scale_min_k4_just2; + q = q + 16 * (il&1); device const uint8_t * s = xb->scales; device const half2 * dh = (device const half2 *)xb->d; @@ -4518,6 +4822,22 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x } } +template +void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + device const uint8_t * qs = xb->qs + 2*il; + device const uint8_t * sc = xb->scales + il; + const float dl1 = d * (2*(sc[0] & 7) + 1); + const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1); + constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + for (int i = 0; i < 8; ++i) { + reg[i/4+0][i%4] = dl1 * grid1[i]; + reg[i/4+2][i%4] = dl2 * grid2[i]; + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -5060,6 +5380,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; +template 
[[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -5099,6 +5420,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -5150,6 +5472,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -6117,3 +6440,66 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq1_s_f32")]] +kernel void kernel_mul_mv_id_iq1_s_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq1_s_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-quants.c b/ggml-quants.c index 8787f3579..ca7eb6030 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1839,9 +1839,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri float sigma2 = sumx2/QK_K; for (int j = 0; j < QK_K/16; ++j) { const float * restrict qw = quant_weights + QK_K * i + 16*j; - for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); - for (int l = 0; l < 16; ++l) sw[j] += weight[l]; - scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); + for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l]; + scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); } float dm = make_qp_quants(QK_K/16, 15, scales, Ls, 
sw); @@ -3482,6 +3482,139 @@ static const uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +#define NGRID_IQ2XXS 512 +static const uint64_t iq1s_grid[NGRID_IQ2XXS] = { + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, + 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 
0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 
0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 
0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, + +}; + static const uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3580,6 +3713,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y } } +// ====================== 1.5625 bpw (de)-quantization + +void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[4]; + uint16_t idx[4]; + //const int8_t * grid[4]; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * sc = x[i].scales; + const uint8_t * qs = x[i].qs; + + for (int i8 = 0; i8 < QK_K/8; i8 += 4) { + idx[0] = qs[0] | ((sc[0] & 0x08) << 5); + idx[1] = qs[1] | ((sc[0] & 0x80) << 1); + idx[2] = qs[2] | ((sc[1] & 0x08) << 5); + idx[3] = qs[3] | ((sc[1] & 0x80) << 1); + //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); + //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5))); + //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1))); + db[0] = d * (2*(sc[0] & 7) + 1); + db[1] = d * (2*((sc[0] >> 4) & 7) + 1); + db[2] = d * (2*(sc[1] & 7) + 1); + db[3] = d * (2*((sc[1] >> 4) & 7) + 1); + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + //y[j] = db[l] * grid[l][j]; + y[j] = db[l] * grid[j]; + } + y += 8; + } + qs += 4; + sc += 2; + } + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -3850,15 +4026,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + __m128i bx_0 = _mm_and_si128(lowMask, tmp); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); // Convert int32_t to float __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); @@ -4444,21 +4620,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx_0 = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_andnot_si128(bxhil, mask); bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = 
_mm256_extractf128_si256(bx, 1); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); + bx_0 = MM256_SET_M128I(bxh, bxl); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); /* Multiply q with scale and accumulate */ acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); @@ -4751,22 +4927,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx_0 = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_and_si128(bxhil, mask); bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); + bx_0 = MM256_SET_M128I(bxh, bxl); const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_us8_pairs_float(bx, by); + const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); } @@ -4995,10 +5171,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; i++) { // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl); - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl); vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); @@ -9109,6 +9285,178 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } +#ifdef __AVX2__ +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i ax = _mm256_sign_epi8(x, x); + const __m256i sy = _mm256_sign_epi8(y, x); + return _mm256_maddubs_epi16(ax, sy); +} +#endif + +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined __ARM_NEON + + const uint8x16_t m8 = vdupq_n_u8(0x08); + const uint8x16_t m7 = vdupq_n_u8(0x07); + const uint8x16_t m1 = vdupq_n_u8(0x01); + const int32x4_t vzero = vdupq_n_s32(0); + + uint16_t gindex[8]; + uint16x8x2_t vindex; + int8x16x4_t q1b; + int8x16x4_t q8b; + uint16x8x4_t scales; + int32x4x2_t sumi; + int32x4x2_t dotq; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = 
x[i].scales; + + sumi.val[0] = sumi.val[1] = vzero; + + for (int i128 = 0; i128 < QK_K/128; ++i128) { + const uint8x16_t ql = vld1q_u8(qs); qs += 16; + const uint8x8_t tm1 = vld1_u8 (sc); sc += 8; + const uint8x8_t tm2 = vshr_n_u8(tm1, 4); + const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2)); + const uint8x16_t hbit = vandq_u8(qh, m8); + vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5)); + vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5)); + const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1); + scales.val[0] = vmovl_u8(vget_low_u8 (scales8)); + scales.val[1] = vmovl_u8(vget_high_u8 (scales8)); + + for (int l = 0; l < 2; ++l) { + vst1q_u16(gindex+0, vindex.val[l]); + q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1]))); + q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3]))); + q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5]))); + q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7]))); + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1])); + dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3])); + + sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l])))); + sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l])))); + } + } + + sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1])); + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m128i m8 = _mm_set1_epi8(0x08); + const __m128i m7 = _mm_set1_epi8(0x07); + const __m128i m1 = _mm_set1_epi8(0x01); + const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); + const __m128i shuffle_s[4] = { + _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000), + _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404), + _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808), + _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c) + }; + + uint64_t aux64; + + __m256i v_gindex; + const uint16_t * gindex = (const uint16_t *)&v_gindex; + + __m256 accum = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = x[i].scales; + + __m256i sumi = _mm256_setzero_si256(); + for (int i128 = 0; i128 < QK_K/128; ++i128) { + const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16; + memcpy(&aux64, sc, 8); sc += 8; + const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h); + const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8)); + v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5)); + const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1); + + for (int i32 = 0; i32 < 4; ++i32) { + const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]], + iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]); + const 
__m256i dot = mul_add_epi8(q1b, q8b); + const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32])); + const __m256i p = _mm256_madd_epi16(s16, dot); + sumi = _mm256_add_epi32(sumi, p); + } + + } + + accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum); + + } + + *s = hsum_float_8(accum); + +#else + + int db[4]; + uint16_t idx[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * sc = x[i].scales; + + int sumi = 0; + for (int i32 = 0; i32 < QK_K/32; ++i32) { + idx[0] = qs[0] | ((sc[0] & 0x08) << 5); + idx[1] = qs[1] | ((sc[0] & 0x80) << 1); + idx[2] = qs[2] | ((sc[1] & 0x08) << 5); + idx[3] = qs[3] | ((sc[1] & 0x80) << 1); + db[0] = (2*(sc[0] & 7) + 1); + db[1] = (2*((sc[0] >> 4) & 7) + 1); + db[2] = (2*(sc[1] & 7) + 1); + db[3] = (2*((sc[1] >> 4) & 7) + 1); + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + int suml = 0; + for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j]; + sumi += db[l] * suml; + q8 += 8; + } + qs += 4; + sc += 2; + } + + sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi; + } + + *s = sumf; + +#endif + +} + // ================================ IQ2 quantization ============================================= typedef struct { @@ -9117,14 +9465,22 @@ typedef struct { uint16_t * neighbours; } iq2_entry_t; -static iq2_entry_t iq2_data[2] = { +static iq2_entry_t iq2_data[3] = { + {NULL, NULL, NULL}, {NULL, NULL, NULL}, {NULL, NULL, NULL}, }; -static inline int iq2_data_index(int grid_size) { - GGML_ASSERT(grid_size == 256 || grid_size == 512); - return grid_size == 256 ? 0 : 1; +static inline int iq2_data_index(enum ggml_type type) { + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S); + return type == GGML_TYPE_IQ2_XXS ? 0 : + type == GGML_TYPE_IQ2_XS ? 1 : 2; +} + +static inline int iq2_grid_size(enum ggml_type type) { + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S); + return type == GGML_TYPE_IQ2_XXS ? 256 : + type == GGML_TYPE_IQ2_XS ? 512 : 512; } static int iq2_compare_func(const void * left, const void * right) { @@ -9133,12 +9489,13 @@ static int iq2_compare_func(const void * left, const void * right) { return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 
1 : 0; } -void iq2xs_init_impl(int grid_size) { - const int gindex = iq2_data_index(grid_size); +void iq2xs_init_impl(enum ggml_type type) { + const int gindex = iq2_data_index(type); + const int grid_size = iq2_grid_size(type); if (iq2_data[gindex].grid) { return; } - static const uint16_t kgrid_256[256] = { + static const uint16_t kgrid_2bit_256[256] = { 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97, 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642, 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288, @@ -9156,7 +9513,7 @@ void iq2xs_init_impl(int grid_size) { 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142, 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268, }; - static const uint16_t kgrid_512[512] = { + static const uint16_t kgrid_2bit_512[512] = { 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257, 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340, @@ -9190,9 +9547,45 @@ void iq2xs_init_impl(int grid_size) { 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048, 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690, }; + static const uint16_t kgrid_1bit_512[512] = { + 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545, + 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444, + 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440, + 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422, + 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397, + 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769, + 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788, + 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794, + 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272, + 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665, + 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685, + 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529, + 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517, + 20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872, + 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653, + 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842, + 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913, + 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608, + 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072, + 23125, 23209, 24597, 24640, 
24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110, + 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937, + 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885, + 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808, + 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320, + 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918, + 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125, + 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973, + 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485, + 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497, + 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514, + 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512, + 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680, + }; + const int kmap_size = 43692; - const int nwant = 2; - const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2; + const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 : + type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512; uint64_t * kgrid_q2xs; int * kmap_q2xs; uint16_t * kneighbors_q2xs; @@ -9288,9 +9681,9 @@ void iq2xs_init_impl(int grid_size) { free(dist2); } -void iq2xs_free_impl(int grid_size) { - GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024); - const int gindex = iq2_data_index(grid_size); +void iq2xs_free_impl(enum ggml_type type) { + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S); + const int gindex = iq2_data_index(type); if (iq2_data[gindex].grid) { free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; free(iq2_data[gindex].map); iq2_data[gindex].map = NULL; @@ -9324,7 +9717,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { - const int gindex = iq2_data_index(256); + const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS); const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; const int * kmap_q2xs = iq2_data[gindex].map; @@ -9497,7 +9890,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { - const int gindex = iq2_data_index(512); + const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS); const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; const int * kmap_q2xs = iq2_data[gindex].map; @@ -10134,3 +10527,207 @@ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * re assert(k % QK_K == 0); quantize_row_iq3_xxs_impl(x, y, k, NULL); } + +// =================================== 1.5 bpw =================================================== + +static int 
iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, + const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) { + int num_neighbors = neighbours[0]; + GGML_ASSERT(num_neighbors > 0); + float best_score = 0; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + sumqx += w*q*xval[i]; + sumq2 += w*q*q; + } + if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + *scale = sumqx/sumq2; best_score = *scale * sumqx; + grid_index = neighbours[j]; + } + } + if (grid_index < 0) { + for (int i = 0; i < ngrid; ++i) { + const int8_t * grid_i = (const int8_t *)(grid + i); + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < 8; ++j) { + float w = weight[j]; + float q = (grid_i[j] - 3)/2; + sumqx += w*q*xval[j]; + sumq2 += w*q*q; + } + if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + *scale = sumqx/sumq2; best_score = *scale*sumqx; + grid_index = i; + } + } + } + if (grid_index < 0) { + printf("Oops, did not find grid point\n"); + printf("Have %d neighbours\n", num_neighbors); + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + sumqx += w*q*xval[i]; + sumq2 += w*q*q; + } + printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2); + } + } + GGML_ASSERT(grid_index >= 0); + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + *scale *= 1.05f; // This is a fudge factor. Don't ask me why it improves the result. + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static int iq1_sort_helper(const void * left, const void * right) { + const float * l = left; + const float * r = right; + return *l < *r ? -1 : *l > *r ? 
1 : 0; +} + +static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + const int nbl = n/256; + + block_iq1_s * y = vy; + + float scales[QK_K/8]; + float weight[8]; + int8_t L[8]; + float sumx[9]; + float sumw[9]; + float pairs[16]; + int * idx = (int *)(pairs + 1); + uint8_t hbit[QK_K/8]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = GGML_FP32_TO_FP16(0.f); + memset(y[ibl].qs, 0, QK_K/8); + memset(y[ibl].scales, 0, QK_K/16); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/8; ++ib) { + const float * xb = xbl + 8*ib; + const float * qw = quant_weights + QK_K*ibl + 8*ib; + for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + float max = fabsf(xb[0]); + for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i])); + if (!max) { + scales[ib] = 0; + memset(L, 1, 8); + continue; + } + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale + // and score for each possible split. + for (int j = 0; j < 8; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper); + { + sumx[0] = sumw[0] = 0; + for (int j = 0; j < 8; ++j) { + int i = idx[2*j]; + sumx[j+1] = sumx[j] + weight[i]*xb[i]; + sumw[j+1] = sumw[j] + weight[i]; + } + } + float best_score = 0, scale = max; + int besti1 = 0, besti2 = 0; + for (int i1 = 0; i1 <= 8; ++i1) { + for (int i2 = i1; i2 <= 8; ++i2) { + float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]); + float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]); + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; + } + } + } + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < 8; ++j) L[j] = 2 - L[j]; + scale = -scale; + } + // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring + // grid point that minimizes SSD.
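+ // Pack the eight ternary values L[j] (each 0, 1 or 2) into a 16-bit index, 2 bits per value.
+ // kmap_q2xs maps that index to a position on the 512-entry iq1_s grid; a negative map entry means the
+ // point is not on the grid and instead encodes an offset into kneighbors_q2xs, the list of candidate
+ // neighbouring grid points searched by iq1_find_best_neighbour.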
+ uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS); + GGML_ASSERT(grid_index >= 0); + } + y[ibl].qs[ib] = grid_index & 255; + hbit[ib] = grid_index >> 8; + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/8); + continue; + } + + float d = max_scale/15; + y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed. + float id = 1/d; + for (int ib = 0; ib < QK_K/8; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(7, l)); + if (hbit[ib]) l |= 8; + y[ibl].scales[ib/2] |= (l << 4*(ib%2)); + } + } +} + +size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq1_s); + } + return nrow * nblock * sizeof(block_iq1_s); +} diff --git a/ggml-quants.h b/ggml-quants.h index 68f09b1e1..ad381cfab 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,6 +191,13 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + #ifdef __cplusplus extern "C" { #endif @@ -243,6 +250,7 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -259,6 +267,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") @@ -266,6 +275,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); @@ -276,8 +286,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -void iq2xs_init_impl(int grid_size); -void iq2xs_free_impl(int grid_size); +void iq2xs_init_impl(enum ggml_type type); +void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cd4b3a1e1..df1826112 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, } } -static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); 
- }); -} - -static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, 
vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); +template +static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), [= + ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); } int get_device_index_by_id(int id){ @@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + // TODO: support these quantization types + GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS || + src0->type == GGML_TYPE_IQ2_XS || + src0->type == GGML_TYPE_IQ3_XXS || + src0->type == GGML_TYPE_IQ1_S)); + switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + 
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_TENSOR_BINARY_OP_LOCALS + GGML_TENSOR_BINARY_OP_LOCALS; const int64_t row_diff = row_high - row_low; @@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten return false; } + if (a->type == GGML_TYPE_IQ1_S) { + return false; + } + if (a->type == GGML_TYPE_IQ3_XXS) { + return false; + } if (a->type == GGML_TYPE_IQ2_XXS) { return false; } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4a30414df..4e5eaff15 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } -static void ggml_vk_instance_init() { +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); + +void ggml_vk_instance_init() { if (vk_instance_initialized) { return; } @@ -1100,28 +1103,42 @@ static void ggml_vk_instance_init() { #endif vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; - const std::vector layers = { -#ifdef GGML_VULKAN_VALIDATE - "VK_LAYER_KHRONOS_validation", -#endif - }; - const std::vector extensions = { -#ifdef GGML_VULKAN_VALIDATE - "VK_EXT_validation_features", -#endif - }; - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); -#ifdef GGML_VULKAN_VALIDATE - const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; - vk::ValidationFeaturesEXT validation_features = { - features_enable, - {}, - }; - validation_features.setPNext(nullptr); - instance_create_info.setPNext(&validation_features); - std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; -#endif + const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); + const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions); + const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); + + std::vector layers; + + if (validation_ext) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + std::vector extensions; + if (validation_ext) { + extensions.push_back("VK_EXT_validation_features"); + } + if (portability_enumeration_ext) { + extensions.push_back("VK_KHR_portability_enumeration"); + } + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); + if (portability_enumeration_ext) { + instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + } + + std::vector features_enable; + vk::ValidationFeaturesEXT validation_features; + + if (validation_ext) { + features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; + validation_features = { + features_enable, + {}, + }; + validation_features.setPNext(nullptr); + instance_create_info.setPNext(&validation_features); + + std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; + } vk_instance.instance = 
vk::createInstance(instance_create_info); memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES); @@ -1168,12 +1185,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { vk_instance.devices[idx] = std::make_shared(); ctx->device = vk_instance.devices[idx]; ctx->device.lock()->physical_device = devices[dev_num]; - std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); + const std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; // Check if maintenance4 is supported - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; } @@ -1204,7 +1221,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { bool fp16_storage = false; bool fp16_compute = false; - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { @@ -5301,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() { return vk_instance.device_indices.size(); } +// Extension availability +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions) { +#ifdef GGML_VULKAN_VALIDATE + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions) { +#ifdef __APPLE__ + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." 
<< std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} + // checks #ifdef GGML_VULKAN_CHECK_RESULTS diff --git a/ggml.c b/ggml.c index f6ab1ca78..3ccbb5471 100644 --- a/ggml.c +++ b/ggml.c @@ -23,6 +23,9 @@ #include #include #include +#if defined(__gnu_linux__) +#include +#endif #ifdef GGML_USE_METAL #include @@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" +#elif defined(GGML_USE_VULKAN) +#include "ggml-vulkan.h" #endif #elif defined(GGML_USE_OPENBLAS) #if defined(GGML_BLAS_USE_MKL) @@ -673,6 +678,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ1_S] = { + .type_name = "iq1_s", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_s), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_s, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = ggml_vec_dot_iq1_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -868,7 +885,7 @@ do { \ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ _mm256_extractf128_ps(x[0], 1)); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ + res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ } while (0) // TODO: is this optimal ? @@ -1149,7 +1166,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ + res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ } // TODO: is this optimal ? @@ -1959,7 +1976,7 @@ struct ggml_numa_nodes { uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system uint32_t current_node; // node on which main process is execting -#ifdef __linux__ +#if defined(__gnu_linux__) cpu_set_t cpuset; // cpuset from numactl #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype @@ -1997,7 +2014,7 @@ inline static void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -#ifdef __linux__ +#if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; pthread_t thread; @@ -2019,7 +2036,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { return; } -#ifdef __linux__ +#if defined(__gnu_linux__) struct stat st; char path[256]; int rv; @@ -2051,7 +2068,13 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { // figure out which node we're on uint current_cpu; - int getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); + int getcpu_ret = 0; +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) + getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); +#else + // old glibc doesn't have a wrapper for this call. 
Fall back on direct syscall + getcpu_ret = syscall(SYS_getcpu,¤t_cpu,&g_state.numa.current_node); +#endif if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; @@ -2086,6 +2109,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { } } #else + GGML_UNUSED(numa_flag); // TODO #endif } @@ -2266,6 +2290,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; + case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -3219,7 +3244,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) { } struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { - strncpy(tensor->name, name, sizeof(tensor->name)); + strncpy(tensor->name, name, sizeof(tensor->name) - 1); tensor->name[sizeof(tensor->name) - 1] = '\0'; return tensor; } @@ -5095,16 +5120,28 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, + struct ggml_tensor * pos, float scale, + float max_bias, bool inplace) { GGML_ASSERT(ggml_is_contiguous(a)); + if (mask) { GGML_ASSERT(ggml_is_contiguous(mask)); - GGML_ASSERT(mask->ne[2] == 1); - GGML_ASSERT(mask->ne[3] == 1); + GGML_ASSERT(ggml_is_matrix(mask)); GGML_ASSERT(ggml_can_repeat_rows(mask, a)); } + if (pos) { + GGML_ASSERT(ggml_is_vector(pos)); + GGML_ASSERT(pos->type == GGML_TYPE_F32); + GGML_ASSERT(pos->ne[0] == a->ne[0]); + } + + if (max_bias > 0.0f) { + GGML_ASSERT(pos); + } + bool is_node = false; if (a->grad) { @@ -5113,13 +5150,14 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - float params[] = { scale }; + float params[] = { scale, max_bias }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = mask; + result->src[2] = pos; return result; } @@ -5127,21 +5165,23 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_tensor * ggml_soft_max( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false); + return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false); } struct ggml_tensor * ggml_soft_max_inplace( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true); + return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true); } struct ggml_tensor * ggml_soft_max_ext( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, - float scale) { - return ggml_soft_max_impl(ctx, a, mask, scale, false); + struct ggml_tensor * pos, + float scale, + float max_bias) { + return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false); } // ggml_soft_max_back @@ -7661,6 +7701,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7928,6 +7969,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -8048,6 +8090,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: default: { GGML_ASSERT(false); @@ -10814,6 +10857,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -10994,6 +11038,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: default: { GGML_ASSERT(false); @@ -11191,6 +11236,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11494,6 +11540,7 @@ static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * src2, struct ggml_tensor * dst) { assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); @@ -11502,16 +11549,29 @@ static void ggml_compute_forward_soft_max_f32( return; } - float scale = 1.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); // TODO: handle transposed/permuted matrices const int ith = params->ith; const int nth = params->nth; + GGML_TENSOR_UNARY_OP_LOCALS + const int64_t ne11 = src1 ? src1->ne[1] : 1; + // TODO: is this supposed to be ceil instead of floor? 
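The hunk continues below by deriving one ALiBi slope per attention head from max_bias: the first n_head_log2 heads use base m0 = 2^(-max_bias/n_head_log2), the remaining heads use base m1 = 2^(-(max_bias/2)/n_head_log2) with odd exponents. A minimal standalone sketch of that slope math for reference (plain C-style code; the helper name and the 8-head example are illustrative, not part of the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// hypothetical helper mirroring the slope derivation in ggml_compute_forward_soft_max_f32
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}

int main(void) {
    // example: 8 heads with max_bias = 8.0f yields slopes 1/2, 1/4, ..., 1/256
    for (uint32_t h = 0; h < 8; ++h) {
        printf("head %u: slope %f\n", h, alibi_slope(h, 8, 8.0f));
    }
    return 0;
}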
+ // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 + const uint32_t n_head_kv = ne02; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); @@ -11524,6 +11584,9 @@ static void ggml_compute_forward_soft_max_f32( float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching + float * pos = src2 ? (float *) src2->data : src0->data; + for (int i1 = ir0; i1 < ir1; i1++) { float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); @@ -11537,6 +11600,16 @@ static void ggml_compute_forward_soft_max_f32( ggml_vec_acc_f32(nc, wp, mp); } + // ALiBi bias + if (max_bias > 0.0f) { + const uint32_t h = (i1/ne01)%ne02; // head + const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1); + + for (int i = 0; i < nc; i++) { + wp[i] = wp[i] + slope*pos[i]; + } + } + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { //printf("p[%d] = %f\n", i, p[i]); @@ -11581,11 +11654,12 @@ static void ggml_compute_forward_soft_max( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * src2, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst); } break; default: { @@ -11729,22 +11803,20 @@ static void ggml_compute_forward_alibi_f32( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - for (int64_t i = 0; i < ne0; i++) { - for (int64_t j = 0; j < ne1; j++) { - for (int64_t k = 0; k < ne2_ne3; k++) { + for (int64_t k = 0; k < ne2_ne3; k++) { + // TODO: k*nb2 or k*nb3 + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + for (int64_t i = 0; i < ne0; i++) { + for (int64_t j = 0; j < ne1; j++) { float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 - - float m_k; - - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); - } - pdst[0] = i * m_k + src[0]; } } @@ -11789,21 +11861,20 @@ static void ggml_compute_forward_alibi_f16( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - for (int k = 0; k < ne2_ne3; k++) { + for (int k = 0; k < ne2_ne3; k++) { + // TODO: k*nb2 or k*nb3 + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 - - float m_k; - - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 
1); - } + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // we return F32 pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); @@ -11839,6 +11910,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11916,6 +11988,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -15115,7 +15188,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { @@ -16672,7 +16745,7 @@ typedef pthread_t ggml_thread_t; #endif // Android's libc implementation "bionic" does not support setting affinity -#if defined(__linux__) && !defined(__BIONIC__) +#if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; @@ -17847,7 +17920,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * ptr += ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } @@ -17950,7 +18023,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * result->nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } } @@ -18575,7 +18648,9 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - GGML_UNREACHABLE(); + GGML_ASSERT(false && "line search failed"); + + return GGML_LINESEARCH_FAIL; } static enum ggml_opt_result ggml_opt_lbfgs( @@ -18843,7 +18918,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - GGML_UNREACHABLE(); + GGML_ASSERT(false && "lbfgs failed"); + + return GGML_OPT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -19091,8 +19168,9 @@ void ggml_quantize_init(enum ggml_type type) { ggml_critical_section_start(); switch (type) { - case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break; - case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break; + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; default: // nothing break; @@ -19104,8 +19182,10 @@ void ggml_quantize_init(enum ggml_type type) { void ggml_quantize_free(void) { ggml_critical_section_start(); - iq2xs_free_impl(256); - iq2xs_free_impl(512); + iq2xs_free_impl(GGML_TYPE_IQ2_XXS); + iq2xs_free_impl(GGML_TYPE_IQ2_XS); + iq2xs_free_impl(GGML_TYPE_IQ1_S); + iq3xs_free_impl(256); ggml_critical_section_end(); } @@ -19240,7 +19320,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ2_XS; + type == GGML_TYPE_IQ2_XS || + type == GGML_TYPE_IQ1_S; } size_t ggml_quantize_chunk(enum ggml_type 
type, const float * src, void * dst, int start, @@ -19365,6 +19446,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ1_S: + { + GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/ggml.h b/ggml.h index 9e50accc3..619092eaf 100644 --- a/ggml.h +++ b/ggml.h @@ -361,6 +361,7 @@ extern "C" { GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -398,6 +399,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors }; // available tensor operations: @@ -1390,13 +1392,17 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - // fused soft_max(a*scale + mask) + // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope)) // mask is optional + // pos is required when max_bias > 0.0f + // max_bias = 0.0f for no ALiBi GGML_API struct ggml_tensor * ggml_soft_max_ext( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, - float scale); + struct ggml_tensor * pos, + float scale, + float max_bias); GGML_API struct ggml_tensor * ggml_soft_max_back( struct ggml_context * ctx, @@ -1498,12 +1504,13 @@ extern "C" { // alibi position embedding // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_alibi( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, int n_head, - float bias_max); + float bias_max), + "use ggml_soft_max_ext instead (will be removed in Mar 2024)"); // clamp // in-place, returns view(a) diff --git a/llama.cpp b/llama.cpp index 5567fe33c..3fb5443ff 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1585,12 +1585,13 @@ struct llama_hparams { uint32_t n_yarn_orig_ctx; int32_t rope_scaling_type_train; - float f_clamp_kqv; - float f_max_alibi_bias; + float f_clamp_kqv = 0.0f; + float f_max_alibi_bias = 0.0f; bool causal_attn = true; - uint32_t pooling_type = LLAMA_POOLING_NONE; + bool need_kq_pos = false; + uint32_t pooling_type = LLAMA_POOLING_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1955,6 +1956,7 @@ struct llama_context { struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] + struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] @@ -2557,6 +2559,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; + case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; default: { 
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2919,6 +2922,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; default: return "unknown, may not work"; } @@ -3100,6 +3104,11 @@ static void llm_load_hparams( case 40: model.type = e_model::MODEL_13B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + if (model.type == e_model::MODEL_13B) { + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; + } } break; case LLM_ARCH_STARCODER: { @@ -3127,6 +3136,9 @@ static void llm_load_hparams( case 32: model.type = e_model::MODEL_1B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; } break; case LLM_ARCH_BERT: { @@ -3172,11 +3184,12 @@ static void llm_load_hparams( case 4096: model.type = e_model::MODEL_7B; break; } break; } + + // TODO: become GGUF KV parameter + hparams.f_max_alibi_bias = 8.0f; } break; case LLM_ARCH_MPT: { - hparams.f_clamp_kqv = 0.0f; - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); @@ -3278,6 +3291,10 @@ static void llm_load_hparams( } model.ftype = ml.ftype; + + if (hparams.f_max_alibi_bias > 0.0f) { + hparams.need_kq_pos = true; + } } // TODO: This should probably be in llama.h @@ -4846,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * wo_b, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, + struct ggml_tensor * kq_pos, int64_t n_ctx, int32_t n_tokens, int32_t n_kv, - float max_alibi_bias, float kq_scale, const llm_build_cb & cb, int il) { @@ -4879,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv( ggml_mul_mat_set_prec(kq, GGML_PREC_F32); } - if (max_alibi_bias > 0.0f) { - // temporary branch until we figure out how to handle ggml_alibi through ggml_add +#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL) +#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL") +#pragma message(" Falling back to ggml_alibi(). 
Will become an error in Mar 2024") +#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") + if (hparams.f_max_alibi_bias > 0.0f) { kq = ggml_scale(ctx, kq, kq_scale); cb(kq, "kq_scaled", il); - if (max_alibi_bias > 0.0f) { - // TODO: n_head or n_head_kv - // TODO: K-shift is likely not working - // TODO: change to ggml_add - kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); - } + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); kq = ggml_add(ctx, kq, kq_mask); cb(kq, "kq_masked", il); kq = ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); - } else { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale); + } else +#endif + { + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); } @@ -4946,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * v_cur, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, + struct ggml_tensor * kq_pos, int64_t n_ctx, int32_t n_tokens, int32_t kv_head, int32_t n_kv, - float max_alibi_bias, float kq_scale, const llm_build_cb & cb, int il) { @@ -4964,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv( llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, kv, graph, - wo, wo_b, - q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il); + cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, + q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -5134,7 +5150,7 @@ struct llm_build_context { } Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -5149,7 +5165,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5279,6 +5295,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + // shift the entire K-cache if needed if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); @@ -5327,12 +5347,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); - // apply ALiBi for 13B model - const float max_alibi_bias = model.type == MODEL_13B ? 
8.0f : -1.0f; - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5456,7 +5473,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5555,7 +5572,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5760,7 +5777,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5822,6 +5839,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5849,7 +5870,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5950,7 +5971,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } else { // compute Q and K and RoPE them @@ -5981,7 +6002,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6057,6 +6078,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + inpL = llm_build_norm(ctx0, inpL, hparams, 
model.tok_norm, model.tok_norm_b, @@ -6090,7 +6115,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6150,6 +6175,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -6183,7 +6212,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6305,7 +6334,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6420,7 +6449,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6541,7 +6570,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6668,7 +6697,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -6771,7 +6800,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = cur; @@ -6870,7 +6899,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6979,7 +7008,7 
@@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7097,7 +7126,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7216,7 +7245,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7348,7 +7377,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7579,6 +7608,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } + if (hparams.need_kq_pos) { + const int64_t n_kv = kv_self.n; + + assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); + + float * data = (float *) lctx.inp_KQ_pos->data; + + for (int i = 0; i < n_kv; ++i) { + data[i] = float(lctx.kv_self.cells[i].pos); + } + } + if (kv_self.has_shift) { const int64_t n_ctx = cparams.n_ctx; @@ -10596,20 +10637,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { new_type = GGML_TYPE_Q5_K; } else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } } else if (name == "token_embd.weight") { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { new_type = GGML_TYPE_Q2_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_Q4_K; } - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q2_K; @@ -10619,6 +10660,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K; ++qs.i_ffn_down; } + else if (name.find("attn_output.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS; + } } 
else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -10752,7 +10796,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ3_XXS) { + new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -10767,6 +10811,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; @@ -10809,6 +10854,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -10982,6 +11028,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS || + new_type == GGML_TYPE_IQ1_S || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); @@ -11747,7 +11794,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*7, + /* .mem_size */ ggml_tensor_overhead()*8, /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -11757,6 +11804,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); @@ -11765,6 +11813,7 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_embd, "inp_embd"); ggml_set_name(ctx->inp_pos, "inp_pos"); ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); + ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); @@ -12787,6 +12836,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token 
return 0; } +// trim whitespace from the beginning and end of a string +static std::string trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && isspace(str[start])) { + start += 1; + } + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + return str.substr(start, end - start); +} + +// Simple version of "llama_apply_chat_template" that only works with strings +// This function uses heuristic checks to determine commonly used template. It is not a jinja parser. +static int32_t llama_chat_apply_template_internal( + const std::string & tmpl, + const std::vector & chat, + std::string & dest, bool add_ass) { + // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 + std::stringstream ss; + if (tmpl.find("<|im_start|>") != std::string::npos) { + // chatml template + for (auto message : chat) { + ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; + } + if (add_ass) { + ss << "<|im_start|>assistant\n"; + } + } else if (tmpl.find("[INST]") != std::string::npos) { + // llama2 template and its variants + // [variant] support system message + bool support_system_message = tmpl.find("<>") != std::string::npos; + // [variant] space before + after response + bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; + // [variant] add BOS inside history + bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos; + // [variant] trim spaces from the input message + bool strip_message = tmpl.find("content.strip()") != std::string::npos; + // construct the prompt + bool is_inside_turn = true; // skip BOS at the beginning + ss << "[INST] "; + for (auto message : chat) { + std::string content = strip_message ? trim(message->content) : message->content; + std::string role(message->role); + if (!is_inside_turn) { + is_inside_turn = true; + ss << (add_bos_inside_history ? "[INST] " : "[INST] "); + } + if (role == "system") { + if (support_system_message) { + ss << "<>\n" << content << "\n<>\n\n"; + } else { + // if the model does not support system message, we still include it in the first message, but without <> + ss << content << "\n"; + } + } else if (role == "user") { + ss << content << " [/INST]"; + } else { + ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << ""; + is_inside_turn = false; + } + } + // llama2 templates seem to not care about "add_generation_prompt" + } else if (tmpl.find("<|user|>") != std::string::npos) { + // zephyr template + for (auto message : chat) { + ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } + } else { + // template not supported + return -1; + } + dest = ss.str(); + return dest.size(); +} + +LLAMA_API int32_t llama_chat_apply_template( + const struct llama_model * model, + const char * tmpl, + const struct llama_chat_message * chat, + size_t n_msg, + bool add_ass, + char * buf, + int32_t length) { + std::string curr_tmpl(tmpl == nullptr ? 
"" : tmpl); + if (tmpl == nullptr) { + GGML_ASSERT(model != nullptr); + // load template from model + std::vector model_template(2048, 0); // longest known template is about 1200 bytes + std::string template_key = "tokenizer.chat_template"; + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size()); + if (res < 0) { + // worst case: there is no information about template, we will use chatml by default + curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal + } else { + curr_tmpl = std::string(model_template.data(), model_template.size()); + } + } + // format the chat to string + std::vector chat_vec; + chat_vec.resize(n_msg); + for (size_t i = 0; i < n_msg; i++) { + chat_vec[i] = &chat[i]; + } + std::string formatted_chat; + int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); + if (res < 0) { + return res; + } + strncpy(buf, formatted_chat.c_str(), length); + return res; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, diff --git a/llama.h b/llama.h index 29cae889f..47bede0f2 100644 --- a/llama.h +++ b/llama.h @@ -100,6 +100,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -304,6 +305,12 @@ extern "C" { int32_t n_eval; }; + // used in chat template + typedef struct llama_chat_message { + const char * role; + const char * content; + } llama_chat_message; + // Helpers for getting default parameters LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -700,6 +707,25 @@ extern "C" { char * buf, int32_t length); + /// Apply chat template. Inspired by hf apply_chat_template() on python. + /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" + /// NOTE: This function only support some known jinja templates. It is not a jinja parser. + /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. + /// @param chat Pointer to a list of multiple llama_chat_message + /// @param n_msg Number of llama_chat_message in this chat + /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. + /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages) + /// @param length The size of the allocated buffer + /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. 
+ LLAMA_API int32_t llama_chat_apply_template( + const struct llama_model * model, + const char * tmpl, + const struct llama_chat_message * chat, + size_t n_msg, + bool add_ass, + char * buf, + int32_t length); + // // Grammar // diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp new file mode 100644 index 000000000..9830650d4 --- /dev/null +++ b/tests/test-chat-template.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#undef NDEBUG +#include + +#include "llama.h" + +int main(void) { + llama_chat_message conversation[] = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "Who are you"}, + {"assistant", " I am an assistant "}, + {"user", "Another question"}, + }; + size_t message_count = 6; + std::vector templates = { + // teknium/OpenHermes-2.5-Mistral-7B + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}", + // mistralai/Mistral-7B-Instruct-v0.2 + "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + // TheBloke/FusionNet_34Bx2_MoE-AWQ + "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <>\\\\n' + messages[idx]['content'] + '\\\\n<>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}", + // bofenghuang/vigogne-2-70b-chat + "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + }; + std::vector expected_substr = { + "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant", + "[/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + "[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + "[/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + }; + std::vector formatted_chat(1024); + int32_t res; + + // test invalid chat template + res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); + assert(res < 0); + + for (size_t i = 0; i < templates.size(); i++) { + std::string custom_template = templates[i]; + std::string substr = expected_substr[i]; + formatted_chat.resize(1024); + res = llama_chat_apply_template( + nullptr, + custom_template.c_str(), + conversation, + message_count, + true, + formatted_chat.data(), + formatted_chat.size() + ); + formatted_chat.resize(res); + std::string output(formatted_chat.data(), formatted_chat.size()); + std::cout << output << "\n-------------------------\n"; + // expect the "formatted_chat" to contain pre-defined strings + assert(output.find(substr) != std::string::npos); + } + return 0; +}
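The llama_chat_apply_template() API added in llama.h above documents a grow-and-retry convention for the output buffer: the return value is the full formatted length, which may exceed the buffer passed in. A hedged caller-side sketch of that convention (illustrative helper, not part of the patch), using the model's built-in template by passing tmpl = nullptr:

#include <string>
#include <vector>
#include "llama.h"

static std::string format_chat(const llama_model * model, const std::vector<llama_chat_message> & msgs) {
    std::vector<char> buf(1024); // initial guess; grown below if the prompt does not fit
    int32_t len = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                            /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (len > (int32_t) buf.size()) {
        // the formatted prompt was larger than the buffer: re-alloc and re-apply, as the @return note describes
        buf.resize(len);
        len = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                        /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    }
    return len < 0 ? std::string() : std::string(buf.data(), len);
}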
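As an appendix to the fused soft-max change above (ggml.h: "fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))"), a minimal single-row reference of the same semantics, assuming the per-head slope is computed as in the earlier slope sketch; plain C-style code, illustrative only and independent of the ggml tensor API:

#include <math.h>
#include <stddef.h>
#include <stdio.h>

// reference for one row: dst = softmax(src*scale + mask + pos[i]*slope)
// mask and pos may be NULL, mirroring the optional arguments of ggml_soft_max_ext
static void soft_max_ext_row_ref(float * dst, const float * src, const float * mask,
                                 const float * pos, float slope, float scale, size_t n) {
    float vmax = -INFINITY;
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i]*scale + (mask ? mask[i] : 0.0f) + (pos ? pos[i]*slope : 0.0f);
        if (dst[i] > vmax) vmax = dst[i];
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        dst[i] = expf(dst[i] - vmax);
        sum += dst[i];
    }
    for (size_t i = 0; i < n; ++i) {
        dst[i] /= sum;
    }
}

int main(void) {
    const float src[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    const float pos[4] = {0.0f, 1.0f, 2.0f, 3.0f}; // KV cell positions, as filled into inp_KQ_pos
    float dst[4];
    soft_max_ext_row_ref(dst, src, /*mask=*/NULL, pos, /*slope=*/0.5f, /*scale=*/1.0f, 4);
    for (int i = 0; i < 4; ++i) {
        printf("%f\n", dst[i]);
    }
    return 0;
}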