diff --git a/examples/llava/README.md b/examples/llava/README.md
index 25ea96715..35e6d9e5d 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -63,13 +63,12 @@ Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Backup your pth/safetensor model files as llava-surgery modifies them
-3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+2) Use `llava-surgery-v2.py`, which supports llava-1.5 variants as well, for both PyTorch and safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -77,18 +76,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
-5) Create the visual gguf model:
+4) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
-6) Then convert the model to gguf format:
+5) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/
+python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
+ - `--skip-unknown` is needed because the surgery script no longer rewrites the original checkpoints, leaving tensors in place that the converter does not recognize
-7) And finally we can run the llava-cli using the 1.6 model version:
+6) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py
index 5bc5bc513..eb56d6988 100644
--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava-surgery-v2.py
@@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
         for name in clip_tensors:
             del checkpoint[name]
 
-        # Save the updated checkpoint
         checkpoint_path = checkpoint_path
-        save_model(checkpoint, checkpoint_path, file_type)
         return True
     return False
@@ -152,16 +150,6 @@ for name in first_mm_tensors:
 if len(projector) > 0:
     save_model(projector, f"{args.model}/llava.projector", 'pytorch')
 
-for name in mm_tensors:
-    del last_checkpoint[name]
-for name in first_mm_tensors:
-    del first_checkpoint[name]
-
-if len(mm_tensors) > 0:
-    save_model(last_checkpoint, projector_checkpoint_path, file_type)
-if len(first_mm_tensors) > 0:
-    save_model(first_checkpoint, newline_checkpoint_path, file_type)
-
 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index dae3703fc..8c49a626e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -335,6 +335,8 @@ int main(int argc, char ** argv) {
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
         params.n_keep = (int)embd_inp.size();
+    } else {
+        params.n_keep += add_bos; // always keep the BOS token
     }
 
     // prefix & suffix for instruct mode
@@ -384,8 +386,8 @@
         }
     }
 
-    if (params.n_keep > 0) {
-        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+    if (params.n_keep > add_bos) {
+        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
         for (int i = 0; i < params.n_keep; i++) {
             LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
@@ -541,14 +543,14 @@
             break;
         }
 
-        const int n_left    = n_past - params.n_keep - 1;
+        const int n_left    = n_past - params.n_keep;
         const int n_discard = n_left/2;
 
         LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-        llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-        llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+        llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+        llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
         n_past -= n_discard;
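Since `n_keep` now counts the BOS token itself (`params.n_keep += add_bos` above), the `+ 1` corrections drop out of the shift arithmetic: the kept prefix is exactly `[0, n_keep)` and eviction starts at `n_keep`. Below is a minimal sketch of the new bookkeeping with a worked example; `shift_context` is a hypothetical standalone helper for illustration, not a function from the codebase:

```cpp
#include <cassert>

// Sketch of the context-shift arithmetic after this patch: n_keep already
// includes the BOS token, so the kept prefix is simply [0, n_keep).
void shift_context(int &n_past, int n_keep) {
    const int n_left    = n_past - n_keep; // tokens eligible for eviction
    const int n_discard = n_left / 2;      // drop the older half of them

    // Stands in for the two KV-cache calls in the hunk above:
    //   llama_kv_cache_seq_rm   (ctx, 0, n_keep,             n_keep + n_discard);
    //   llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    n_past -= n_discard;
}

int main() {
    // Worked example: n_past = 512, n_keep = 5 (BOS + 4 prompt tokens).
    int n_past = 512;
    shift_context(n_past, 5);    // n_left = 507, n_discard = 253
    assert(n_past == 512 - 253); // cache slots [5, 258) were evicted,
    return 0;                    // [258, 512) slid down by 253
}
```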
diff --git a/examples/server/README.md b/examples/server/README.md
index f6b9c7402..4b24ee5dc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+- `--chat-template JINJA_TEMPLATE`: Set a custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from the model's metadata). Only [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) are supported.
 
 ## Build
 
@@ -140,6 +141,8 @@ node index.js
   - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
   - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
+  If the query parameter `include_slots` is passed (e.g. `GET /health?include_slots=true`), the `slots` field will contain internal slot data, unless `--slots-endpoint-disable` is set.
+
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
 
     *Options:*
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 517e0257f..635a52603 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1395,6 +1395,46 @@ struct llama_server_context
             case TASK_TYPE_NEXT_RESPONSE: {
                 // do nothing
             } break;
+            case TASK_TYPE_SLOTS_DATA: {
+                json slots_data = json::array();
+                int n_idle_slots = 0;
+                int n_processing_slots = 0;
+
+                for (llama_client_slot &slot: slots) {
+                    if (slot.available()) {
+                        n_idle_slots++;
+                    } else {
+                        n_processing_slots++;
+                    }
+                    json slot_data = get_formated_generation(slot);
+                    slot_data["id"] = slot.id;
+                    slot_data["task_id"] = slot.task_id;
+                    slot_data["state"] = slot.state;
+                    slot_data["prompt"] = slot.prompt;
+                    slot_data["next_token"] = {
+                        {"has_next_token", slot.has_next_token},
+                        {"n_remain", slot.n_remaining},
+                        {"num_tokens_predicted", slot.n_decoded},
+                        {"stopped_eos", slot.stopped_eos},
+                        {"stopped_word", slot.stopped_word},
+                        {"stopped_limit", slot.stopped_limit},
+                        {"stopping_word", slot.stopping_word},
+                    };
+                    slots_data.push_back(slot_data);
+                }
+                LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+                task_result res;
+                res.id = task.id;
+                res.multitask_id = task.multitask_id;
+                res.stop = true;
+                res.error = false;
+                res.result_json = {
+                    { "idle",       n_idle_slots       },
+                    { "processing", n_processing_slots },
+                    { "slots",      slots_data         }
+                };
+                queue_results.send(res);
+            } break;
         }
     }
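A note on the new `TASK_TYPE_SLOTS_DATA` plumbing: the HTTP handlers used to iterate `llama.slots` directly from their own threads (see the `/health` and `/slots` hunks that follow), while the main loop mutates those slots; routing the read through the task queue means slot state is only ever touched by the thread that processes tasks, which appears to be the point of the change. A self-contained sketch of that request/reply pattern, using simplified stand-in types rather than the server's actual `queue_tasks`/`queue_results` classes:

```cpp
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

// Minimal request/reply queue in the spirit of queue_tasks / queue_results.
// All names here are illustrative, not the server's real API.
struct reply_queue {
    std::mutex m;
    std::condition_variable cv;
    std::deque<int> replies;

    void send(int v) {
        { std::lock_guard<std::mutex> lk(m); replies.push_back(v); }
        cv.notify_one();
    }
    int recv() {
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return !replies.empty(); });
        int v = replies.front();
        replies.pop_front();
        return v;
    }
};

int main() {
    reply_queue results;
    int n_idle_slots = 3; // owned by the worker thread, like llama.slots

    // The worker plays the role of the main loop handling TASK_TYPE_SLOTS_DATA:
    // it alone touches the slot state and publishes a snapshot as the reply.
    std::thread worker([&] { results.send(n_idle_slots); });

    // HTTP-handler side: block until the snapshot arrives, then use it.
    printf("slots_idle=%d\n", results.recv());
    worker.join();
    return 0;
}
```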
@@ -1448,14 +1488,15 @@ struct llama_server_context
                 if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                 {
                     // Shift context
-                    const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
+                    const int n_keep    = slot.params.n_keep + add_bos_token;
+                    const int n_left    = system_tokens.size() + slot.n_past - n_keep;
                     const int n_discard = n_left / 2;
 
-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
+                    llama_kv_cache_seq_rm   (ctx, slot.id, n_keep            , n_keep + n_discard);
+                    llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
 
-                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                     {
                         slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                     }
@@ -1468,7 +1509,7 @@ struct llama_server_context
                     LOG_VERBOSE("context shift", {
                         { "n_ctx",  n_ctx },
-                        { "n_keep", params.n_keep },
+                        { "n_keep", n_keep },
                         { "n_left", n_left },
                     });
                 }
@@ -2558,34 +2599,38 @@ int main(int argc, char **argv)
             server_state current_state = state.load();
             switch(current_state) {
                 case SERVER_STATE_READY: {
-                    int available_slots = 0;
-                    int processing_slots = 0;
-                    for (llama_client_slot &slot: llama.slots) {
-                        if (slot.available()) {
-                            available_slots++;
-                        } else {
-                            processing_slots++;
-                        }
+                    // request slots data using task queue
+                    task_server task;
+                    task.id = llama.queue_tasks.get_new_id();
+                    task.type = TASK_TYPE_SLOTS_DATA;
+                    task.target_id = -1;
+
+                    llama.queue_results.add_waiting_task_id(task.id);
+                    llama.queue_tasks.post(task);
+
+                    // get the result
+                    task_result result = llama.queue_results.recv(task.id);
+                    llama.queue_results.remove_waiting_task_id(task.id);
+
+                    int n_idle_slots       = result.result_json["idle"];
+                    int n_processing_slots = result.result_json["processing"];
+
+                    json health = {
+                        {"status",           "ok"},
+                        {"slots_idle",       n_idle_slots},
+                        {"slots_processing", n_processing_slots}};
+                    res.status = 200; // HTTP OK
+                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
+                        health["slots"] = result.result_json["slots"];
                     }
-                    if (available_slots > 0) {
-                        json health = {
-                            {"status",           "ok"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                        res.set_content(health.dump(), "application/json");
-                        res.status = 200; // HTTP OK
-                    } else {
-                        json health = {
-                            {"status",           "no slot available"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                        res.set_content(health.dump(), "application/json");
+
+                    if (n_idle_slots == 0) {
+                        health["status"] = "no slot available";
                         if (req.has_param("fail_on_no_slot")) {
                             res.status = 503; // HTTP Service Unavailable
-                        } else {
-                            res.status = 200; // HTTP OK
                         }
                     }
+                    res.set_content(health.dump(), "application/json");
                     break;
                 }
                 case SERVER_STATE_LOADING_MODEL:
@@ -2601,26 +2646,20 @@ int main(int argc, char **argv)
     if (sparams.slots_endpoint) {
         svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
-            json slots;
-            for (llama_client_slot & slot : llama.slots) {
-                json slot_data = llama.get_formated_generation(slot);
-                slot_data["id"] = slot.id;
-                slot_data["task_id"] = slot.task_id;
-                slot_data["state"] = slot.state;
-                slot_data["prompt"] = slot.prompt;
-                slot_data["next_token"] = {
-                    {"has_next_token", slot.has_next_token},
-                    {"n_remain", slot.n_remaining},
-                    {"num_tokens_predicted", slot.n_decoded},
-                    {"stopped_eos", slot.stopped_eos},
-                    {"stopped_word", slot.stopped_word},
-                    {"stopped_limit", slot.stopped_limit},
-                    {"stopping_word", slot.stopping_word},
-                };
+            // request slots data using task queue
+            task_server task;
+            task.id = llama.queue_tasks.get_new_id();
+            task.type = TASK_TYPE_SLOTS_DATA;
+            task.target_id = -1;
 
-                slots.push_back(slot_data);
-            }
-            res.set_content(slots.dump(), "application/json");
+            llama.queue_results.add_waiting_task_id(task.id);
+            llama.queue_tasks.post(task);
+
+            // get the result
+            task_result result = llama.queue_results.recv(task.id);
+            llama.queue_results.remove_waiting_task_id(task.id);
+
+            res.set_content(result.result_json["slots"].dump(), "application/json");
             res.status = 200; // HTTP OK
         });
     }
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index e954fb0ef..88545eb69 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -49,7 +49,8 @@ enum server_state {
 enum task_type {
     TASK_TYPE_COMPLETION,
     TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE
+    TASK_TYPE_NEXT_RESPONSE,
+    TASK_TYPE_SLOTS_DATA
 };
 
 struct task_server {
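The rest of the patch is one mechanical refactor of ggml.c: each `ggml_compute_forward_*` helper loses its explicit `src0`/`src1` parameters and instead reads its operands from `dst->src[]`, so every call site shrinks to `(params, dst)`. Each graph node already carries pointers to its operands, so the parameters were redundant; the new shape also lets a kernel pick up an extra operand without a signature change, as `ggml_compute_forward_mul_mat_id` does with `dst->src[2]`. A toy sketch of the convention, with simplified stand-in types:

```cpp
#include <cstdio>

// Stand-ins for ggml's types, just to show the calling convention.
struct tensor {
    float         value;
    const tensor *src[2]; // operands now travel inside the destination node
};

// Before the patch: forward(params, src0, src1, dst).
// After the patch:  forward(params, dst), sources read from dst->src[].
static void compute_forward_add(tensor *dst) {
    const tensor *src0 = dst->src[0];
    const tensor *src1 = dst->src[1];
    dst->value = src0->value + src1->value;
}

int main() {
    tensor a = {1.0f, {nullptr, nullptr}};
    tensor b = {2.0f, {nullptr, nullptr}};
    tensor c = {0.0f, {&a, &b}};
    compute_forward_add(&c);   // no src arguments: c carries its operands
    printf("%.1f\n", c.value); // prints 3.0
    return 0;
}
```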
diff --git a/ggml.c b/ggml.c
index 7df3933b6..670068265 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5645,7 +5645,9 @@ struct ggml_tensor * ggml_conv_2d(
             ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
             ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
 
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
+    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+
     return result;
 }
@@ -6651,8 +6653,10 @@ void ggml_set_param(
 
 static void ggml_compute_forward_dup_same_cont(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
@@ -6683,8 +6687,10 @@ static void ggml_compute_forward_dup_same_cont(
 }
 
 static void ggml_compute_forward_dup_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6697,7 +6703,7 @@ static void ggml_compute_forward_dup_f16(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
@@ -6954,8 +6960,10 @@ static void ggml_compute_forward_dup_f16(
 
 static void ggml_compute_forward_dup_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6968,7 +6976,7 @@ static void ggml_compute_forward_dup_f32(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
@@ -7204,8 +7212,10 @@ static void ggml_compute_forward_dup_f32(
 
 // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
static void ggml_compute_forward_dup_bytes( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); @@ -7214,7 +7224,7 @@ static void ggml_compute_forward_dup_bytes( } if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + ggml_compute_forward_dup_same_cont(params, dst); return; } @@ -7353,21 +7363,23 @@ static void ggml_compute_forward_dup_bytes( static void ggml_compute_forward_dup( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (src0->type == dst->type) { - ggml_compute_forward_dup_bytes(params, src0, dst); + ggml_compute_forward_dup_bytes(params, dst); return; } switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_dup_f16(params, src0, dst); + ggml_compute_forward_dup_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_dup_f32(params, src0, dst); + ggml_compute_forward_dup_f32(params, dst); } break; default: { @@ -7380,9 +7392,11 @@ static void ggml_compute_forward_dup( static void ggml_compute_forward_add_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7468,9 +7482,11 @@ static void ggml_compute_forward_add_f32( static void ggml_compute_forward_add_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7545,9 +7561,11 @@ static void ggml_compute_forward_add_f16_f32( static void ggml_compute_forward_add_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7599,9 +7617,11 @@ static void ggml_compute_forward_add_f16_f16( static void ggml_compute_forward_add_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -7677,14 +7697,16 @@ static void ggml_compute_forward_add_q_f32( static void ggml_compute_forward_add( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 
= dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(params, src0, src1, dst); + ggml_compute_forward_add_f32(params, dst); } else { GGML_ASSERT(false); @@ -7693,10 +7715,10 @@ static void ggml_compute_forward_add( case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7718,7 +7740,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add_q_f32(params, src0, src1, dst); + ggml_compute_forward_add_q_f32(params, dst); } break; default: { @@ -7731,9 +7753,11 @@ static void ggml_compute_forward_add( static void ggml_compute_forward_add1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7783,9 +7807,11 @@ static void ggml_compute_forward_add1_f32( static void ggml_compute_forward_add1_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7833,9 +7859,11 @@ static void ggml_compute_forward_add1_f16_f32( static void ggml_compute_forward_add1_f16_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7883,9 +7911,11 @@ static void ggml_compute_forward_add1_f16_f16( static void ggml_compute_forward_add1_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); @@ -7950,21 +7980,23 @@ static void ggml_compute_forward_add1_q_f32( static void ggml_compute_forward_add1( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add1_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f32(params, dst); } break; case GGML_TYPE_F16: { if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f16(params, dst); } else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f16_f32(params, dst); } else { GGML_ASSERT(false); @@ -7987,7 +8019,7 @@ static void 
ggml_compute_forward_add1( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + ggml_compute_forward_add1_q_f32(params, dst); } break; default: { @@ -8000,9 +8032,11 @@ static void ggml_compute_forward_add1( static void ggml_compute_forward_acc_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -8082,14 +8116,14 @@ static void ggml_compute_forward_acc_f32( static void ggml_compute_forward_acc( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, dst); + ggml_compute_forward_acc_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -8119,9 +8153,11 @@ static void ggml_compute_forward_acc( static void ggml_compute_forward_sub_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -8179,13 +8215,14 @@ static void ggml_compute_forward_sub_f32( static void ggml_compute_forward_sub( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sub_f32(params, src0, src1, dst); + ggml_compute_forward_sub_f32(params, dst); } break; default: { @@ -8198,9 +8235,11 @@ static void ggml_compute_forward_sub( static void ggml_compute_forward_mul_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8281,15 +8320,17 @@ static void ggml_compute_forward_mul_f32( static void ggml_compute_forward_mul( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mul_f32(params, src0, src1, dst); + ggml_compute_forward_mul_f32(params, dst); } break; default: { @@ -8302,9 +8343,11 @@ static void ggml_compute_forward_mul( static void ggml_compute_forward_div_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && 
ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8375,13 +8418,14 @@ static void ggml_compute_forward_div_f32( static void ggml_compute_forward_div( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_div_f32(params, src0, src1, dst); + ggml_compute_forward_div_f32(params, dst); } break; default: { @@ -8394,8 +8438,10 @@ static void ggml_compute_forward_div( static void ggml_compute_forward_sqr_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8418,12 +8464,14 @@ static void ggml_compute_forward_sqr_f32( static void ggml_compute_forward_sqr( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqr_f32(params, src0, dst); + ggml_compute_forward_sqr_f32(params, dst); } break; default: { @@ -8436,8 +8484,10 @@ static void ggml_compute_forward_sqr( static void ggml_compute_forward_sqrt_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -8460,12 +8510,14 @@ static void ggml_compute_forward_sqrt_f32( static void ggml_compute_forward_sqrt( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqrt_f32(params, src0, dst); + ggml_compute_forward_sqrt_f32(params, dst); } break; default: { @@ -8478,8 +8530,10 @@ static void ggml_compute_forward_sqrt( static void ggml_compute_forward_log_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -8502,12 +8556,14 @@ static void ggml_compute_forward_log_f32( static void ggml_compute_forward_log( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_log_f32(params, src0, dst); + ggml_compute_forward_log_f32(params, dst); } break; default: { @@ -8520,8 +8576,10 @@ static void ggml_compute_forward_log( static void ggml_compute_forward_sum_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8553,8 +8611,10 @@ static void ggml_compute_forward_sum_f32( static void ggml_compute_forward_sum_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_is_scalar(dst)); @@ -8585,16 +8645,18 @@ static void ggml_compute_forward_sum_f16( static 
void ggml_compute_forward_sum( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_f32(params, src0, dst); + ggml_compute_forward_sum_f32(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_sum_f16(params, src0, dst); + ggml_compute_forward_sum_f16(params, dst); } break; default: { @@ -8607,8 +8669,10 @@ static void ggml_compute_forward_sum( static void ggml_compute_forward_sum_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8640,12 +8704,14 @@ static void ggml_compute_forward_sum_rows_f32( static void ggml_compute_forward_sum_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sum_rows_f32(params, src0, dst); + ggml_compute_forward_sum_rows_f32(params, dst); } break; default: { @@ -8658,8 +8724,10 @@ static void ggml_compute_forward_sum_rows( static void ggml_compute_forward_mean_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8695,12 +8763,14 @@ static void ggml_compute_forward_mean_f32( static void ggml_compute_forward_mean( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mean_f32(params, src0, dst); + ggml_compute_forward_mean_f32(params, dst); } break; default: { @@ -8713,8 +8783,10 @@ static void ggml_compute_forward_mean( static void ggml_compute_forward_argmax_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -8741,12 +8813,14 @@ static void ggml_compute_forward_argmax_f32( static void ggml_compute_forward_argmax( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argmax_f32(params, src0, dst); + ggml_compute_forward_argmax_f32(params, dst); } break; default: { @@ -8759,8 +8833,10 @@ static void ggml_compute_forward_argmax( static void ggml_compute_forward_repeat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8802,8 +8878,10 @@ static void ggml_compute_forward_repeat_f32( static void ggml_compute_forward_repeat_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); 
GGML_ASSERT(ggml_can_repeat(src0, dst)); @@ -8848,18 +8926,20 @@ static void ggml_compute_forward_repeat_f16( static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: case GGML_TYPE_I16: { - ggml_compute_forward_repeat_f16(params, src0, dst); + ggml_compute_forward_repeat_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_repeat_f32(params, src0, dst); + ggml_compute_forward_repeat_f32(params, dst); } break; default: { @@ -8872,8 +8952,10 @@ static void ggml_compute_forward_repeat( static void ggml_compute_forward_repeat_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); @@ -8929,12 +9011,14 @@ static void ggml_compute_forward_repeat_back_f32( static void ggml_compute_forward_repeat_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_repeat_back_f32(params, src0, dst); + ggml_compute_forward_repeat_back_f32(params, dst); } break; default: { @@ -8947,10 +9031,11 @@ static void ggml_compute_forward_repeat_back( static void ggml_compute_forward_concat_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -8995,14 +9080,15 @@ static void ggml_compute_forward_concat_f32( static void ggml_compute_forward_concat( const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* src1, struct ggml_tensor* dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_concat_f32(params, src0, src1, dst); + ggml_compute_forward_concat_f32(params, dst); } break; default: { @@ -9015,8 +9101,10 @@ static void ggml_compute_forward_concat( static void ggml_compute_forward_abs_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9039,12 +9127,14 @@ static void ggml_compute_forward_abs_f32( static void ggml_compute_forward_abs( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_abs_f32(params, src0, dst); + ggml_compute_forward_abs_f32(params, dst); } break; default: { @@ -9057,8 +9147,10 @@ static void ggml_compute_forward_abs( static void ggml_compute_forward_sgn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9081,12 +9173,14 @@ static void ggml_compute_forward_sgn_f32( static void 
ggml_compute_forward_sgn( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sgn_f32(params, src0, dst); + ggml_compute_forward_sgn_f32(params, dst); } break; default: { @@ -9099,8 +9193,10 @@ static void ggml_compute_forward_sgn( static void ggml_compute_forward_neg_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9123,12 +9219,14 @@ static void ggml_compute_forward_neg_f32( static void ggml_compute_forward_neg( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_neg_f32(params, src0, dst); + ggml_compute_forward_neg_f32(params, dst); } break; default: { @@ -9141,8 +9239,10 @@ static void ggml_compute_forward_neg( static void ggml_compute_forward_step_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9165,12 +9265,14 @@ static void ggml_compute_forward_step_f32( static void ggml_compute_forward_step( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_step_f32(params, src0, dst); + ggml_compute_forward_step_f32(params, dst); } break; default: { @@ -9183,8 +9285,10 @@ static void ggml_compute_forward_step( static void ggml_compute_forward_tanh_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9207,12 +9311,14 @@ static void ggml_compute_forward_tanh_f32( static void ggml_compute_forward_tanh( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_tanh_f32(params, src0, dst); + ggml_compute_forward_tanh_f32(params, dst); } break; default: { @@ -9225,8 +9331,10 @@ static void ggml_compute_forward_tanh( static void ggml_compute_forward_elu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9249,12 +9357,14 @@ static void ggml_compute_forward_elu_f32( static void ggml_compute_forward_elu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_elu_f32(params, src0, dst); + ggml_compute_forward_elu_f32(params, dst); } break; default: { @@ -9267,8 +9377,10 @@ static void ggml_compute_forward_elu( static void ggml_compute_forward_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct 
ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9291,12 +9403,14 @@ static void ggml_compute_forward_relu_f32( static void ggml_compute_forward_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_relu_f32(params, src0, dst); + ggml_compute_forward_relu_f32(params, dst); } break; default: { @@ -9309,8 +9423,10 @@ static void ggml_compute_forward_relu( static void ggml_compute_forward_gelu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9350,12 +9466,14 @@ static void ggml_compute_forward_gelu_f32( static void ggml_compute_forward_gelu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_f32(params, src0, dst); + ggml_compute_forward_gelu_f32(params, dst); } break; default: { @@ -9368,8 +9486,10 @@ static void ggml_compute_forward_gelu( static void ggml_compute_forward_gelu_quick_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9409,12 +9529,14 @@ static void ggml_compute_forward_gelu_quick_f32( static void ggml_compute_forward_gelu_quick( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_gelu_quick_f32(params, src0, dst); + ggml_compute_forward_gelu_quick_f32(params, dst); } break; default: { @@ -9427,8 +9549,10 @@ static void ggml_compute_forward_gelu_quick( static void ggml_compute_forward_silu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -9468,12 +9592,14 @@ static void ggml_compute_forward_silu_f32( static void ggml_compute_forward_silu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_f32(params, src0, dst); + ggml_compute_forward_silu_f32(params, dst); } break; default: { @@ -9485,8 +9611,10 @@ static void ggml_compute_forward_silu( static void ggml_compute_forward_leaky_relu_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9512,12 +9640,14 @@ static void ggml_compute_forward_leaky_relu_f32( static void 
ggml_compute_forward_leaky_relu( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_leaky_relu_f32(params, src0, dst); + ggml_compute_forward_leaky_relu_f32(params, dst); } break; default: { @@ -9530,9 +9660,11 @@ static void ggml_compute_forward_leaky_relu( static void ggml_compute_forward_silu_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * grad = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); @@ -9575,13 +9707,14 @@ static void ggml_compute_forward_silu_back_f32( static void ggml_compute_forward_silu_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + ggml_compute_forward_silu_back_f32(params, dst); } break; default: { @@ -9593,8 +9726,10 @@ static void ggml_compute_forward_silu_back( static void ggml_compute_forward_hardswish_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9616,12 +9751,14 @@ static void ggml_compute_forward_hardswish_f32( } static void ggml_compute_forward_hardswish( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardswish_f32(params, src0, dst); + ggml_compute_forward_hardswish_f32(params, dst); } break; default: { @@ -9632,8 +9769,10 @@ static void ggml_compute_forward_hardswish( static void ggml_compute_forward_hardsigmoid_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); @@ -9656,12 +9795,14 @@ static void ggml_compute_forward_hardsigmoid_f32( static void ggml_compute_forward_hardsigmoid( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_hardsigmoid_f32(params, src0, dst); + ggml_compute_forward_hardsigmoid_f32(params, dst); } break; default: { @@ -9675,8 +9816,10 @@ static void ggml_compute_forward_hardsigmoid( static void ggml_compute_forward_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9728,12 +9871,14 @@ static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * 
dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_norm_f32(params, src0, dst); + ggml_compute_forward_norm_f32(params, dst); } break; default: { @@ -9746,8 +9891,10 @@ static void ggml_compute_forward_norm( static void ggml_compute_forward_rms_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9796,12 +9943,14 @@ static void ggml_compute_forward_rms_norm_f32( static void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_f32(params, dst); } break; default: { @@ -9812,9 +9961,11 @@ static void ggml_compute_forward_rms_norm( static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -9969,13 +10120,14 @@ static void ggml_compute_forward_rms_norm_back_f32( static void ggml_compute_forward_rms_norm_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + ggml_compute_forward_rms_norm_back_f32(params, dst); } break; default: { @@ -9988,8 +10140,10 @@ static void ggml_compute_forward_rms_norm_back( static void ggml_compute_forward_group_norm_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -10060,12 +10214,14 @@ static void ggml_compute_forward_group_norm_f32( static void ggml_compute_forward_group_norm( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_group_norm_f32(params, src0, dst); + ggml_compute_forward_group_norm_f32(params, dst); } break; default: { @@ -10111,9 +10267,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -10358,10 +10516,11 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * ids, 
- const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS @@ -10552,9 +10711,11 @@ static void ggml_compute_forward_mul_mat_id( static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10744,9 +10905,11 @@ static void ggml_compute_forward_out_prod_f32( static void ggml_compute_forward_out_prod_q_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); // UNUSED(t0); @@ -10857,9 +11020,10 @@ static void ggml_compute_forward_out_prod_q_f32( static void ggml_compute_forward_out_prod( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -10877,16 +11041,16 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_q_f32(params, dst); } break; case GGML_TYPE_F16: { GGML_ASSERT(false); // todo - // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + // ggml_compute_forward_out_prod_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_f32(params, dst); } break; default: { @@ -10899,8 +11063,10 @@ static void ggml_compute_forward_out_prod( static void ggml_compute_forward_scale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); @@ -10941,12 +11107,14 @@ static void ggml_compute_forward_scale_f32( static void ggml_compute_forward_scale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, dst); + ggml_compute_forward_scale_f32(params, dst); } break; default: { @@ -10959,9 +11127,11 @@ static void ggml_compute_forward_scale( static void ggml_compute_forward_set_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); @@ -11032,14 +11202,14 @@ static void ggml_compute_forward_set_f32( static void ggml_compute_forward_set( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct 
ggml_tensor * src1, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, dst); + ggml_compute_forward_set_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11069,29 +11239,25 @@ static void ggml_compute_forward_set( static void ggml_compute_forward_cpy( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_cont static void ggml_compute_forward_cont( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); + ggml_compute_forward_dup(params, dst); } // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); UNUSED(dst); } @@ -11099,39 +11265,41 @@ static void ggml_compute_forward_reshape( static void ggml_compute_forward_view( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_permute static void ggml_compute_forward_permute( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_transpose static void ggml_compute_forward_transpose( const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { + const struct ggml_tensor * dst) { // NOP UNUSED(params); - UNUSED(src0); + UNUSED(dst); } // ggml_compute_forward_get_rows static void ggml_compute_forward_get_rows_q( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11167,9 +11335,11 @@ static void ggml_compute_forward_get_rows_q( static void ggml_compute_forward_get_rows_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11202,9 +11372,11 @@ static void ggml_compute_forward_get_rows_f16( static void ggml_compute_forward_get_rows_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11237,9 +11409,10 @@ static void ggml_compute_forward_get_rows_f32( static void ggml_compute_forward_get_rows( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case 
GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11258,16 +11431,16 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: { - ggml_compute_forward_get_rows_q(params, src0, src1, dst); + ggml_compute_forward_get_rows_q(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_f16(params, dst); } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { - ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_f32(params, dst); } break; default: { @@ -11298,9 +11471,11 @@ static void ggml_compute_forward_get_rows( static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11335,9 +11510,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16( static void ggml_compute_forward_get_rows_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11372,17 +11549,18 @@ static void ggml_compute_forward_get_rows_back_f32( static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_back_f32(params, dst); } break; default: { @@ -11413,8 +11591,10 @@ static void ggml_compute_forward_get_rows_back( static void ggml_compute_forward_diag_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11453,12 +11633,14 @@ static void ggml_compute_forward_diag_f32( static void ggml_compute_forward_diag( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_f32(params, src0, dst); + ggml_compute_forward_diag_f32(params, dst); } break; default: { @@ -11471,10 +11653,11 @@ static void ggml_compute_forward_diag( static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const float value) { + const struct ggml_tensor * src0 = dst->src[0]; + const int ith = params->ith; const int nth = params->nth; @@ -11524,12 +11707,14 @@ static void ggml_compute_forward_diag_mask_f32( static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * 
src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); } break; default: { @@ -11540,12 +11725,14 @@ static void ggml_compute_forward_diag_mask_inf( static void ggml_compute_forward_diag_mask_zero( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + ggml_compute_forward_diag_mask_f32(params, dst, 0); } break; default: { @@ -11558,10 +11745,12 @@ static void ggml_compute_forward_diag_mask_zero( static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); @@ -11672,14 +11861,14 @@ static void ggml_compute_forward_soft_max_f32( static void ggml_compute_forward_soft_max( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst); + ggml_compute_forward_soft_max_f32(params, dst); } break; default: { @@ -11692,9 +11881,11 @@ static void ggml_compute_forward_soft_max( static void ggml_compute_forward_soft_max_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(dst)); @@ -11769,13 +11960,14 @@ static void ggml_compute_forward_soft_max_back_f32( static void ggml_compute_forward_soft_max_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_back_f32(params, dst); } break; default: { @@ -11788,8 +11980,10 @@ static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_alibi_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11845,8 +12039,10 @@ static void ggml_compute_forward_alibi_f32( static void ggml_compute_forward_alibi_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11905,16 +12101,18 @@ static void ggml_compute_forward_alibi_f16( static void 
ggml_compute_forward_alibi( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, dst); + ggml_compute_forward_alibi_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, dst); + ggml_compute_forward_alibi_f32(params, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -11947,8 +12145,10 @@ static void ggml_compute_forward_alibi( static void ggml_compute_forward_clamp_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -11987,12 +12187,14 @@ static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_clamp_f32(params, src0, dst); + ggml_compute_forward_clamp_f32(params, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -12082,10 +12284,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12258,10 +12462,12 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12423,17 +12629,18 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, true); + ggml_compute_forward_rope_f16(params, dst, true); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, true); + ggml_compute_forward_rope_f32(params, dst, true); } break; default: { @@ -12446,17 +12653,18 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst, false); + ggml_compute_forward_rope_f16(params, dst, false); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst, false); + ggml_compute_forward_rope_f32(params, dst, false); } break; default: { @@ -12469,9 +12677,11 @@ 
static void ggml_compute_forward_rope_back( static void ggml_compute_forward_conv_transpose_1d_f16_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12566,9 +12776,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( static void ggml_compute_forward_conv_transpose_1d_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12663,17 +12875,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32( static void ggml_compute_forward_conv_transpose_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f32(params, dst); } break; default: { @@ -12687,9 +12900,11 @@ static void ggml_compute_forward_conv_transpose_1d( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12773,9 +12988,11 @@ static void ggml_compute_forward_im2col_f32( // dst: result [N, OH, OW, IC*KH*KW] static void ggml_compute_forward_im2col_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16); @@ -12855,17 +13072,15 @@ static void ggml_compute_forward_im2col_f16( static void ggml_compute_forward_im2col( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (dst->type) { case GGML_TYPE_F16: { - ggml_compute_forward_im2col_f16(params, src0, src1, dst); + ggml_compute_forward_im2col_f16(params, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_im2col_f32(params, src0, src1, dst); + ggml_compute_forward_im2col_f32(params, dst); } break; default: { @@ -12879,9 +13094,11 @@ static void ggml_compute_forward_im2col( static void ggml_compute_forward_conv_transpose_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const 
struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -12985,9 +13202,11 @@ static void ggml_compute_forward_conv_transpose_2d( static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_compute_params * params, const enum ggml_op_pool op, - const struct ggml_tensor * src, const int k, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); @@ -13036,7 +13255,6 @@ static void ggml_compute_forward_pool_1d_sk_p0( static void ggml_compute_forward_pool_1d( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; @@ -13047,15 +13265,17 @@ static void ggml_compute_forward_pool_1d( GGML_ASSERT(p0 == 0); // padding not supported GGML_ASSERT(k0 == s0); // only s = k supported - ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); + ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); } // ggml_compute_forward_pool_2d static void ggml_compute_forward_pool_2d( const struct ggml_compute_params * params, - const struct ggml_tensor * src, struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); @@ -13128,9 +13348,10 @@ static void ggml_compute_forward_pool_2d( static void ggml_compute_forward_upscale_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13167,12 +13388,14 @@ static void ggml_compute_forward_upscale_f32( static void ggml_compute_forward_upscale( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_upscale_f32(params, src0, dst); + ggml_compute_forward_upscale_f32(params, dst); } break; default: { @@ -13185,9 +13408,10 @@ static void ggml_compute_forward_upscale( static void ggml_compute_forward_pad_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13225,12 +13449,14 @@ static void ggml_compute_forward_pad_f32( static void ggml_compute_forward_pad( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_pad_f32(params, src0, dst); + ggml_compute_forward_pad_f32(params, dst); } break; default: { @@ -13243,9 +13469,10 @@ static void ggml_compute_forward_pad( static void ggml_compute_forward_argsort_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -13285,13 +13512,14 @@ static void ggml_compute_forward_argsort_f32( static void ggml_compute_forward_argsort( const struct ggml_compute_params * params, - 
const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_argsort_f32(params, src0, dst); + ggml_compute_forward_argsort_f32(params, dst); } break; default: { @@ -13304,11 +13532,13 @@ static void ggml_compute_forward_argsort( static void ggml_compute_forward_flash_attn_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13494,11 +13724,13 @@ static void ggml_compute_forward_flash_attn_f32( static void ggml_compute_forward_flash_attn_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13720,19 +13952,19 @@ static void ggml_compute_forward_flash_attn_f16( static void ggml_compute_forward_flash_attn( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f16(params, masked, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f32(params, masked, dst); } break; default: { @@ -13745,12 +13977,14 @@ static void ggml_compute_forward_flash_attn( static void ggml_compute_forward_flash_ff_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * a, // F16 - const struct ggml_tensor * b0, // F16 fc_w - const struct ggml_tensor * b1, // F32 fc_b - const struct ggml_tensor * c0, // F16 proj_w - const struct ggml_tensor * c1, // F32 proj_b struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; // F16 + const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w + const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b + const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w + const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13878,16 +14112,14 @@ static void ggml_compute_forward_flash_ff_f16( static void ggml_compute_forward_flash_ff( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b0, - const struct ggml_tensor * b1, - const struct ggml_tensor * c0, - const struct ggml_tensor * c1, struct ggml_tensor * dst) { + + const struct ggml_tensor * b0 = dst->src[1]; + switch (b0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + ggml_compute_forward_flash_ff_f16(params, dst); } break; case GGML_TYPE_F32: { @@ -13904,12 +14136,14 @@ static void ggml_compute_forward_flash_ff( static void ggml_compute_forward_flash_attn_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, 
- const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + const struct ggml_tensor * d = dst->src[3]; + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -14257,16 +14491,15 @@ static void ggml_compute_forward_flash_attn_back_f32( static void ggml_compute_forward_flash_attn_back( const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * d, const bool masked, struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + switch (q->type) { case GGML_TYPE_F32: { - ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + ggml_compute_forward_flash_attn_back_f32(params, masked, dst); } break; default: { @@ -14279,8 +14512,10 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14323,12 +14558,14 @@ static void ggml_compute_forward_win_part_f32( static void ggml_compute_forward_win_part( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_part_f32(params, src0, dst); + ggml_compute_forward_win_part_f32(params, dst); } break; default: { @@ -14341,8 +14578,10 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14383,12 +14622,14 @@ static void ggml_compute_forward_win_unpart_f32( static void ggml_compute_forward_win_unpart( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_unpart_f32(params, src0, dst); + ggml_compute_forward_win_unpart_f32(params, dst); } break; default: { @@ -14401,58 +14642,58 @@ static void ggml_compute_forward_win_unpart( static void ggml_compute_forward_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); switch (op) { case GGML_UNARY_OP_ABS: { - ggml_compute_forward_abs(params, src0, dst); + ggml_compute_forward_abs(params, dst); } break; case GGML_UNARY_OP_SGN: { - ggml_compute_forward_sgn(params, src0, dst); + ggml_compute_forward_sgn(params, dst); } break; case GGML_UNARY_OP_NEG: { - ggml_compute_forward_neg(params, src0, dst); + ggml_compute_forward_neg(params, dst); } break; case GGML_UNARY_OP_STEP: { - ggml_compute_forward_step(params, src0, dst); + ggml_compute_forward_step(params, dst); } break; case GGML_UNARY_OP_TANH: { - ggml_compute_forward_tanh(params, src0, dst); + ggml_compute_forward_tanh(params, dst); } break; case GGML_UNARY_OP_ELU: { - ggml_compute_forward_elu(params, src0, dst); 
+ ggml_compute_forward_elu(params, dst); } break; case GGML_UNARY_OP_RELU: { - ggml_compute_forward_relu(params, src0, dst); + ggml_compute_forward_relu(params, dst); } break; case GGML_UNARY_OP_GELU: { - ggml_compute_forward_gelu(params, src0, dst); + ggml_compute_forward_gelu(params, dst); } break; case GGML_UNARY_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, src0, dst); + ggml_compute_forward_gelu_quick(params, dst); } break; case GGML_UNARY_OP_SILU: { - ggml_compute_forward_silu(params, src0, dst); + ggml_compute_forward_silu(params, dst); } break; case GGML_UNARY_OP_HARDSWISH: { - ggml_compute_forward_hardswish(params, src0, dst); + ggml_compute_forward_hardswish(params, dst); } break; case GGML_UNARY_OP_HARDSIGMOID: { - ggml_compute_forward_hardsigmoid(params, src0, dst); + ggml_compute_forward_hardsigmoid(params, dst); } break; default: { @@ -14465,8 +14706,10 @@ static void ggml_compute_forward_unary( static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14492,12 +14735,14 @@ static void ggml_compute_forward_get_rel_pos_f16( static void ggml_compute_forward_get_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + ggml_compute_forward_get_rel_pos_f16(params, dst); } break; default: { @@ -14510,11 +14755,12 @@ static void ggml_compute_forward_get_rel_pos( static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; if (!inplace && params->type == GGML_TASK_INIT) { if (params->ith != 0) { @@ -14578,14 +14824,14 @@ static void ggml_compute_forward_add_rel_pos_f32( static void ggml_compute_forward_add_rel_pos( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + ggml_compute_forward_add_rel_pos_f32(params, dst); } break; default: { @@ -14598,9 +14844,11 @@ static void ggml_compute_forward_add_rel_pos( static void ggml_compute_forward_map_unary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14622,13 +14870,15 @@ static void ggml_compute_forward_map_unary_f32( static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, struct ggml_tensor * dst, const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch 
(src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_compute_forward_map_unary_f32(params, dst, fun); } break; default: { @@ -14641,10 +14891,12 @@ static void ggml_compute_forward_map_unary( static void ggml_compute_forward_map_binary_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); @@ -14669,14 +14921,15 @@ static void ggml_compute_forward_map_binary_f32( static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + ggml_compute_forward_map_binary_f32(params, dst, fun); } break; default: { @@ -14689,9 +14942,11 @@ static void ggml_compute_forward_map_binary( static void ggml_compute_forward_map_custom1_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst, const ggml_custom1_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14705,10 +14960,12 @@ static void ggml_compute_forward_map_custom1_f32( static void ggml_compute_forward_map_custom2_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst, const ggml_custom2_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14722,11 +14979,13 @@ static void ggml_compute_forward_map_custom2_f32( static void ggml_compute_forward_map_custom3_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst, const ggml_custom3_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[2]; + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -14740,8 +14999,10 @@ static void ggml_compute_forward_map_custom3_f32( static void ggml_compute_forward_map_custom1( const struct ggml_compute_params * params, - const struct ggml_tensor * a, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14755,9 +15016,11 @@ static void ggml_compute_forward_map_custom1( static void ggml_compute_forward_map_custom2( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14771,10 +15034,12 @@ static void
ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - const struct ggml_tensor * c, struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[2]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -14788,9 +15053,11 @@ static void ggml_compute_forward_map_custom3( static void ggml_compute_forward_cross_entropy_loss_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_scalar(dst)); @@ -14894,13 +15161,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( static void ggml_compute_forward_cross_entropy_loss( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + ggml_compute_forward_cross_entropy_loss_f32(params, dst); } break; default: { @@ -14913,10 +15181,12 @@ static void ggml_compute_forward_cross_entropy_loss( static void ggml_compute_forward_cross_entropy_loss_back_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * opt0 = dst->src[2]; + GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); @@ -15003,14 +15273,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( static void ggml_compute_forward_cross_entropy_loss_back( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); } break; default: { @@ -15058,312 +15328,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src[0], tensor); + ggml_compute_forward_dup(params, tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add(params, tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_add1(params, tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_acc(params, tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_sub(params, tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src[0], 
tensor->src[1], tensor); + ggml_compute_forward_mul(params, tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_div(params, tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src[0], tensor); + ggml_compute_forward_sqr(params, tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + ggml_compute_forward_sqrt(params, tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src[0], tensor); + ggml_compute_forward_log(params, tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src[0], tensor); + ggml_compute_forward_sum(params, tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + ggml_compute_forward_sum_rows(params, tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src[0], tensor); + ggml_compute_forward_mean(params, tensor); } break; case GGML_OP_ARGMAX: { - ggml_compute_forward_argmax(params, tensor->src[0], tensor); + ggml_compute_forward_argmax(params, tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src[0], tensor); + ggml_compute_forward_repeat(params, tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + ggml_compute_forward_repeat_back(params, tensor); } break; case GGML_OP_CONCAT: { - ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_concat(params, tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_silu_back(params, tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src[0], tensor); + ggml_compute_forward_norm(params, tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + ggml_compute_forward_rms_norm(params, tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rms_norm_back(params, tensor); } break; case GGML_OP_GROUP_NORM: { - ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + ggml_compute_forward_group_norm(params, tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat(params, tensor); } break; case GGML_OP_MUL_MAT_ID: { - ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat_id(params, tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_out_prod(params, tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src[0], tensor); + ggml_compute_forward_scale(params, tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_set(params, tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src[0], tensor); + ggml_compute_forward_cpy(params, tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src[0], tensor); + ggml_compute_forward_cont(params, tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, 
tensor->src[0], tensor); + ggml_compute_forward_reshape(params, tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src[0]); + ggml_compute_forward_view(params, tensor); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src[0]); + ggml_compute_forward_permute(params, tensor); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src[0]); + ggml_compute_forward_transpose(params, tensor); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows(params, tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_get_rows_back(params, tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src[0], tensor); + ggml_compute_forward_diag(params, tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_inf(params, tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + ggml_compute_forward_diag_mask_zero(params, tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_soft_max(params, tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_soft_max_back(params, tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope(params, tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope_back(params, tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src[0], tensor); + ggml_compute_forward_alibi(params, tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src[0], tensor); + ggml_compute_forward_clamp(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_1D: { - ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_1d(params, tensor); } break; case GGML_OP_IM2COL: { - ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_im2col(params, tensor); } break; case GGML_OP_CONV_TRANSPOSE_2D: { - ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_conv_transpose_2d(params, tensor); } break; case GGML_OP_POOL_1D: { - ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_1d(params, tensor); } break; case GGML_OP_POOL_2D: { - ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + ggml_compute_forward_pool_2d(params, tensor); } break; case GGML_OP_UPSCALE: { - ggml_compute_forward_upscale(params, tensor->src[0], tensor); + ggml_compute_forward_upscale(params, tensor); } break; case GGML_OP_PAD: { - ggml_compute_forward_pad(params, tensor->src[0], tensor); + ggml_compute_forward_pad(params, tensor); } break; case GGML_OP_ARGSORT: { - ggml_compute_forward_argsort(params, tensor->src[0], tensor); + ggml_compute_forward_argsort(params, tensor); } break; case GGML_OP_LEAKY_RELU: { - 
ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor); + ggml_compute_forward_leaky_relu(params, tensor); } break; case GGML_OP_FLASH_ATTN: { const int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + ggml_compute_forward_flash_attn(params, masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + ggml_compute_forward_flash_ff(params, tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src[0], tensor); + ggml_compute_forward_win_part(params, tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + ggml_compute_forward_win_unpart(params, tensor); } break; case GGML_OP_UNARY: { - ggml_compute_forward_unary(params, tensor->src[0], tensor); + ggml_compute_forward_unary(params, tensor); } break; case GGML_OP_GET_REL_POS: { - ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + ggml_compute_forward_get_rel_pos(params, tensor); } break; case GGML_OP_ADD_REL_POS: { - ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_add_rel_pos(params, tensor); } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_unary(params, tensor, fun); } break; case GGML_OP_MAP_BINARY: { ggml_binary_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_binary(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM2_F32: { ggml_custom2_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM3_F32: { ggml_custom3_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3_f32(params, tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + ggml_compute_forward_map_custom1(params, tensor); } break; case GGML_OP_MAP_CUSTOM2: { - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_map_custom2(params, tensor); } break; case GGML_OP_MAP_CUSTOM3: { - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + 
ggml_compute_forward_map_custom3(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor); } break; case GGML_OP_NONE: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 114a9a974..8f9139d1b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + GEMMA = auto() class MODEL_TENSOR(IntEnum): @@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.GEMMA: "gemma", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -511,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.GEMMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM, + ], # TODO } diff --git a/llama.cpp b/llama.cpp index faff13e21..859d909c0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -232,6 +232,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_GEMMA, LLM_ARCH_UNKNOWN, }; @@ -258,6 +259,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, }; enum llm_kv { @@ -784,6 +786,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2806,13 +2824,7 @@ struct llama_model_loader { std::vector<no_init<uint8_t>> read_buf; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (!cur) { - // some tensors may be allocated in a different context - continue; - } - + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; @@ -3289,6 +3301,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 18: model.type = e_model::MODEL_2B; break; + case 28: model.type = e_model::MODEL_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3766,7
+3788,7 @@ static bool llm_load_tensors( } // create one context per buffer type - size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; for (auto & it : buft_layer_count) { struct ggml_init_params params = { @@ -3904,6 +3926,7 @@ static bool llm_load_tensors( } else { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } } @@ -4432,6 +4455,40 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_GEMMA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -7438,6 +7495,113 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_gemma() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); +
cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph( @@ -7546,6 +7710,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; default: GGML_ASSERT(false); } @@ -12317,18 +12485,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { - const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - std::vector<uint8_t> tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - tmp_buf.resize(elt_size*kv_head); + size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa;
++ir) { - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } @@ -12430,17 +12599,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { if (kv_buf_size) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); - const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = elt_size*n_embd_k_gqa*kv_head; + size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = elt_size*kv_head; + size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; } } diff --git a/llama.h b/llama.h index b4f7a4f07..ef8667456 100644 --- a/llama.h +++ b/llama.h @@ -710,7 +710,7 @@ extern "C" { /// Apply chat template. Inspired by hf apply_chat_template() on python. /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" - /// NOTE: This function only support some known jinja templates. It is not a jinja parser. + /// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. /// @param chat Pointer to a list of multiple llama_chat_message /// @param n_msg Number of llama_chat_message in this chat
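
A note on the ggml.c portion of the patch: it is one mechanical refactor, applied to every kernel. Each `ggml_compute_forward_*` function now takes only `params` and `dst` and recovers its operands from `dst->src[...]`, which the graph builder fills in anyway. A minimal sketch of the new convention (the op name here is hypothetical; only the signature shape is taken from the patch):

```c
// sketch: operands are read back from dst instead of being passed separately
static void ggml_compute_forward_example_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0]; // first operand
    const struct ggml_tensor * src1 = dst->src[1]; // second operand, NULL for unary ops

    (void) params; (void) src0; (void) src1;       // actual kernel body elided
}
```

Dropping the redundant parameters is what lets the big dispatch switch in `ggml_compute_forward` collapse to a uniform `(params, tensor)` call per op.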
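
Two scalings in `build_gemma()` are easy to miss: token embeddings are multiplied by `sqrtf(n_embd)` right after lookup, and `Qcur` is pre-scaled by `1/sqrt(n_embd_head_k)`, which is why `llm_build_kv` receives a KQ scale of `1.0f` rather than the usual `1/sqrt(head_dim)`. A small numeric check, with the 2B shapes assumed for illustration (the patch itself only pins `n_layer == 18` to MODEL_2B):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float n_embd        = 2048.0f; // assumed Gemma-2B embedding width
    const float n_embd_head_k = 256.0f;  // assumed per-head K/Q width

    printf("embedding scale: %.4f\n", sqrtf(n_embd));               // applied to inpL
    printf("Q pre-scale:     %.4f\n", 1.0f / sqrtf(n_embd_head_k)); // applied to Qcur
    return 0;
}
```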
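
The state save/load hunks replace `ggml_element_size()` arithmetic with `ggml_row_size()`. The distinction matters once the KV cache can hold block-quantized types: a row of `n` elements is then packed into blocks, so its byte size is not `n * element_size`, and the type is queried per tensor (`kv_self.k_l[il]->type`) instead of once from layer 0. A sketch of the difference, assuming `ggml.h` is on the include path and with the row length and types chosen purely for illustration:

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    const int64_t n = 4096; // hypothetical row length

    // for F16 the row is simply n * 2 bytes
    printf("f16 row:  %zu bytes\n", ggml_row_size(GGML_TYPE_F16, n));

    // for a block-quantized type the row is packed, so it is much smaller
    // than a per-element computation would suggest
    printf("q8_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, n));
    return 0;
}
```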