gemma3 vision works, but is using more tokens than expected - may need resizing

Author: Concedo
Date: 2025-03-13 00:31:16 +08:00
Commit: 77debb1b1b
39 changed files with 2140 additions and 814 deletions

@ -639,6 +639,8 @@ ttsmain: examples/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llam
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gemma3-cli: examples/llava/gemma3-cli.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@echo 'This command can be MANUALLY run to regenerate vulkan shaders. Normally concedo will do it, so you do not have to.'

@ -1868,16 +1868,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-o", "--output", "--output-file"}, "FNAME",
string_format("output file (default: '%s')", string_format("output file (default: '%s')", params.out_file.c_str()),
ex == LLAMA_EXAMPLE_EXPORT_LORA
? params.lora_outfile.c_str()
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
? params.cvector_outfile.c_str()
: params.out_file.c_str()),
[](common_params & params, const std::string & value) {
params.out_file = value;
params.cvector_outfile = value;
params.lora_outfile = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
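This consolidation means `-o/--output` now always writes into the single `params.out_file` field for the imatrix, cvector-generator and export-lora examples; the per-example `cvector_outfile`/`lora_outfile` fields disappear (see the common.h hunk further below) and each example seeds its own default before parsing. A rough sketch of the resulting pattern; `common_params_sketch` and the default value are illustrative only, not llama.cpp code:

```cpp
// Illustrative sketch of the unified output-file pattern (not actual llama.cpp code):
// one shared out_file field, with each example choosing its own default up front,
// and "-o FNAME" simply overwriting it.
#include <cstdio>
#include <string>

struct common_params_sketch {              // stand-in for common_params
    std::string out_file;                  // single shared output path
};

int main(int argc, char ** argv) {
    common_params_sketch params;
    params.out_file = "imatrix.dat";       // e.g. the imatrix example's default
    if (argc > 2 && std::string(argv[1]) == "-o") {
        params.out_file = argv[2];         // -o overrides the per-example default
    }
    std::printf("output file: %s\n", params.out_file.c_str());
    return 0;
}
```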

@ -60,7 +60,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
}
msg.role = message.at("role");
if (message.contains("content")) { auto has_content = message.contains("content");
auto has_tool_calls = message.contains("tool_calls");
if (has_content) {
const auto & content = message.at("content");
if (content.is_string()) {
msg.content = content;
@ -81,19 +83,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
} else if (!content.is_null()) {
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
if (message.contains("reasoning_content")) { if (has_tool_calls) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
if (message.contains("tool_calls")) {
for (const auto & tool_call : message.at("tool_calls")) {
common_chat_tool_call tc;
if (!tool_call.contains("type")) {
@ -118,6 +109,18 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msg.tool_calls.push_back(tc);
}
}
if (!has_content && !has_tool_calls) {
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
msgs.push_back(msg);
}
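The reordered checks above relax the OpenAI-compatible schema so an assistant message may carry `content`, `tool_calls`, or both, and is only rejected when it has neither (refs in the hunk: issues #8367 and #12279). A minimal standalone sketch of that rule using nlohmann::json, the JSON library llama.cpp already uses; `validate_message` is an illustrative helper, not part of the patch:

```cpp
// Sketch of the relaxed rule: accept "content", "tool_calls", or both;
// reject only when neither field is present. Hypothetical helper, not llama.cpp code.
#include <nlohmann/json.hpp>
#include <iostream>
#include <stdexcept>

using json = nlohmann::json;

static void validate_message(const json & message) {
    const bool has_content    = message.contains("content");
    const bool has_tool_calls = message.contains("tool_calls");
    if (!has_content && !has_tool_calls) {
        throw std::runtime_error("Expected 'content' or 'tool_calls'");
    }
}

int main() {
    // An assistant turn with only tool calls: previously rejected, now accepted.
    json msg;
    msg["role"] = "assistant";
    msg["tool_calls"] = json::array();
    msg["tool_calls"].push_back({
        {"type", "function"},
        {"function", {{"name", "get_weather"}, {"arguments", "{\"city\":\"Paris\"}"}}}
    });
    validate_message(msg); // does not throw
    std::cout << "message accepted\n";
    return 0;
}
```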
@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
default:
@ -875,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
return data;
}
static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
std::smatch match;
@ -1009,10 +1013,10 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
}
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
// TODO: tighten & simplify the parser, don't accept leading text context.
static std::regex function_regex( static const std::regex function_regex(
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
static std::regex close_regex("\\}\\s*"); static const std::regex close_regex("\\}\\s*");
static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
if (with_builtin_tools) {
std::smatch match;
@ -1102,34 +1106,42 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
static std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
common_chat_msg msg;
msg.role = "assistant";
std::smatch match; std::smatch match;
static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
if (std::regex_match(input, match, reasoning_content_regex)) { if (std::regex_match(input, match, reasoning_content_regex)) {
std::string rest; auto rest = match[3].str();
auto msg = rest_parser(rest);
auto reasoning_content = string_strip(match[2].str());
if (extract_reasoning) { if (extract_reasoning) {
msg.reasoning_content = string_strip(match[2].str()); msg.reasoning_content = reasoning_content;
} else { } else if (!reasoning_content.empty()) {
msg.content = match[1].str(); std::ostringstream content;
content << "<think>" << reasoning_content << "</think>" << msg.content;
msg.content = content.str();
} }
rest = match[3].str(); return msg;
}
return rest_parser(input);
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
static const std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static const std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
if (std::regex_search(rest, match, tool_calls_regex)) { common_chat_msg msg;
msg.role = "assistant";
std::smatch match;
if (std::regex_search(input, match, tool_calls_regex)) {
auto tool_calls = match[1].str(); auto tool_calls = match[1].str();
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex); auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
msg.tool_calls = std::move(msg2.tool_calls); msg.tool_calls = std::move(msg2.tool_calls);
} else { } else {
msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end()); msg.content = input;
} }
} else { return msg;
msg.content = input; });
}
return msg;
} }
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
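`handle_think_tag_prelude` is the new shared entry point: it peels an optional `<think>…</think>` prefix off the model output, hands the remainder to a format-specific parser, and either stores the reasoning in `reasoning_content` or re-wraps it into the content. A standalone illustration of the regex split it relies on (not the actual llama.cpp code):

```cpp
// Standalone illustration of the <think> prelude split used above:
// group 2 captures the reasoning text, group 3 captures everything after </think>.
#include <iostream>
#include <regex>
#include <string>

int main() {
    static const std::regex reasoning_content_regex(
        "((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");

    const std::string input = "<think>The user wants JSON.</think>{\"answer\": 42}";

    std::smatch match;
    if (std::regex_match(input, match, reasoning_content_regex)) {
        std::cout << "reasoning: " << match[2].str() << "\n"; // The user wants JSON.
        std::cout << "rest:      " << match[3].str() << "\n"; // {"answer": 42}
    }
    return 0;
}
```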
@ -1234,8 +1246,8 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
static std::regex close_regex(R"($|(?=>>>))"); static const std::regex close_regex(R"($|(?=>>>))");
std::string content;
auto it = input.begin();
@ -1324,7 +1336,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)"); static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
std::smatch match;
if (std::regex_search(input, match, python_tag_regex)) {
auto code = match[1].str();
@ -1338,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
});
return msg;
}
static std::regex function_regex(R"(<function=(\w+)>)"); static const std::regex function_regex(R"(<function=(\w+)>)");
static std::regex close_regex(R"(</function>)"); static const std::regex close_regex(R"(</function>)");
// TODO: tighten & simplify.
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
@ -1406,6 +1418,8 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
"(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", "(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
"<function",
@ -1426,122 +1440,123 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
});
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input) { static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
const static std::regex open_regex( return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
"(?:" static const std::regex open_regex(
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) "(?:"
"(<tool_call>" // match 2 (open_tag) "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
"|<function_call>" "(<tool_call>" // match 2 (open_tag)
"|<tool>" "|<function_call>"
"|<tools>" "|<tool>"
"|<response>" "|<tools>"
"|<json>" "|<response>"
"|<xml>" "|<json>"
"|<JSON>" "|<xml>"
")?" "|<JSON>"
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) ")?"
")" "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
"|" ")"
"(?:<function=([^>]+)>" // match 4 (function name) "|"
"|<function name=\"([^\"]+)\">)" // match 5 (function name again) "(?:<function=([^>]+)>" // match 4 (function name)
"([\\s\\S]*)" // match 6 (function arguments + rest)})" "|<function name=\"([^\"]+)\">)" // match 5 (function name again)
); "([\\s\\S]*)" // match 6 (function arguments + rest)})"
);
try { try {
common_chat_msg msg;
msg.role = "assistant";
common_chat_msg msg; std::string::const_iterator it = input.begin();
msg.role = "assistant"; const std::string::const_iterator end = input.end();
std::smatch match;
std::string::const_iterator it = input.begin(); while (it != end) {
const std::string::const_iterator end = input.end(); if (std::regex_search(it, end, match, open_regex)) {
std::smatch match; // Add content before the match
msg.content += std::string(it, match[0].first);
while (it != end) { auto block_start = match[1].str();
if (std::regex_search(it, end, match, open_regex)) { std::string block_end = block_start.empty() ? "" : "```";
// Add content before the match
msg.content += std::string(it, match[0].first);
auto block_start = match[1].str(); auto open_tag = match[2].str();
std::string block_end = block_start.empty() ? "" : "```"; std::string close_tag;
auto open_tag = match[2].str(); if (match[3].matched) {
std::string close_tag; close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
auto json_it = match[3].first;
json tool_call;
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
if (match[3].matched) { msg.tool_calls.emplace_back(process_tool_call(tool_call));
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1); it = json_it; // Move iterator past parsed JSON
auto json_it = match[3].first;
json tool_call;
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
msg.tool_calls.emplace_back(process_tool_call(tool_call)); // Handle close tags
it = json_it; // Move iterator past parsed JSON consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
// Handle close tags throw std::runtime_error("Failed to parse closing tag");
consume_spaces(it, end); }
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { consume_spaces(it, end);
throw std::runtime_error("Failed to parse closing tag"); if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
} }
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else { } else {
// Not a valid tool call, treat as content auto function_name = match[4].str();
msg.content += std::string(match[0].first, match[0].second); if (function_name.empty()) {
it = match[0].second; function_name = match[5].str();
}
GGML_ASSERT(!function_name.empty());
close_tag = "</function>";
// Start parsing from after the opening tags
auto json_it = match[6].first;
json arguments;
if (parse_json(json_it, end, arguments)) {
msg.tool_calls.emplace_back(process_tool_call({
{"name", function_name},
{"arguments", arguments},
}));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
} }
} else { } else {
auto function_name = match[4].str(); // Add remaining content
if (function_name.empty()) { msg.content += std::string(it, end);
function_name = match[5].str(); break;
}
GGML_ASSERT(!function_name.empty());
close_tag = "</function>";
// Start parsing from after the opening tags
auto json_it = match[6].first;
json arguments;
if (parse_json(json_it, end, arguments)) {
msg.tool_calls.emplace_back(process_tool_call({
{"name", function_name},
{"arguments", arguments},
}));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
} }
} else {
// Add remaining content
msg.content += std::string(it, end);
break;
} }
return msg;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
} }
return msg; });
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
}
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@ -1606,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_command_r7b(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@ -1627,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_without_tools(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Functionary v3.1 (w/ tools)
if (src.find("<|start_header_id|>") != std::string::npos
&& src.find("<function=") != std::string::npos) {
@ -1749,7 +1764,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
return common_chat_parse_functionary_v3_1_llama_3_1(input);
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
return common_chat_parse_hermes_2_pro(input); return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
return common_chat_parse_firefunction_v2(input);
case COMMON_CHAT_FORMAT_COMMAND_R7B:

@ -53,6 +53,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

@ -403,8 +403,6 @@ struct common_params {
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
@ -416,16 +414,16 @@ struct common_params {
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
// common params
std::string out_file; // output filename for all example programs
}; };
// call once at the start of a program if it uses libcommon

@ -861,6 +861,9 @@ class Model:
for token_id, token_data in added_tokens_decoder.items():
token_id = int(token_id)
token: str = token_data["content"]
if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
if tokens[token_id] != token.encode("utf-8"):
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@ -3322,6 +3325,83 @@ class Gemma2Model(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
class Gemma3Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA3
has_vision: bool = False
# we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = Model.load_hparams(kwargs["dir_model"])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
super().__init__(*args, **kwargs)
if "vision_config" in hparams:
logger.info("Has vision encoder, but it will be ignored")
self.has_vision = True
def write(self):
super().write()
if self.has_vision:
logger.info("NOTE: this script only convert the language model to GGUF")
logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py")
def set_vocab(self):
self._set_vocab_sentencepiece()
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
# some default values are not specified in the hparams
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
# both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
assert hparams.get("attn_logit_softcapping") is None
assert hparams.get("final_logit_softcapping") is None
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
if hparams.get("rope_scaling") is not None:
assert hparams["rope_scaling"]["rope_type"] == "linear"
# important: this rope_scaling is only applied for global layers, and not used by 1B model
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("language_model."):
name = name.replace("language_model.", "")
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
# ignore vision tensors
return []
# remove OOV (out-of-vocabulary) rows in token_embd
if "embed_tokens.weight" in name:
vocab = self._create_vocab_sentencepiece()
tokens = vocab[0]
data_torch = data_torch[:len(tokens)]
# ref code in Gemma3RMSNorm
# output = output * (1.0 + self.weight.float())
if name.endswith("norm.weight"):
data_torch = data_torch + 1
return [(self.map_tensor_name(name), data_torch)]
@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
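One non-obvious step in the Gemma3Model converter above is `data_torch = data_torch + 1` for tensors ending in `norm.weight`. Gemma 3's RMSNorm scales the normalized activations by `1 + weight` (see the quoted Gemma3RMSNorm comment), so folding the `+1` into the stored weight lets a standard RMSNorm multiply reproduce it at inference time. A short sketch of the algebra, my own summary rather than text from the patch:

```latex
% Reference Gemma3RMSNorm:  y = \hat{x}\,(1 + w)
% Standard RMSNorm kernel:  y = \hat{x}\,w'
% where \hat{x} = x / \sqrt{\operatorname{mean}(x^2) + \varepsilon}.
% Storing w' = w + 1 in the GGUF makes the two expressions identical:
\hat{x}\,(1 + w) = \hat{x}\,w' \quad \text{with} \quad w' = w + 1.
```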

@ -0,0 +1,30 @@
# Gemma 3 vision
> [!IMPORTANT]
>
> This is very experimental and only intended for demo purposes.
## How to get mmproj.gguf?
```bash
cd gemma-3-4b-it
python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
# output file is mmproj.gguf
```
## How to run it?
What you need:
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from step above
- An image file
```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli
# run it
./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
```

@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout minicpm-omni
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build llama.cpp using `CMake`:
https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
```bash
cmake -B build
cmake --build build --config Release
```
Inference on Linux or Mac
``` ```bash
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

@ -4,13 +4,26 @@
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
### Usage Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-Llama3-V 2.5
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M ./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
``` ```bash
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

@ -4,13 +4,25 @@
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout minicpmv-main
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
``` ```bash
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

@ -4,6 +4,7 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
@ -12,19 +13,9 @@
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
@ -158,6 +149,8 @@ static std::string format(const char * fmt, ...) {
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline" #define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query" #define TN_MINICPMV_QUERY "resampler.query"
@ -184,6 +177,7 @@ enum projector_type {
PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -194,6 +188,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
}; };
@ -320,7 +315,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
return kv.first; return kv.first;
} }
} }
return PROJECTOR_TYPE_UNKNOWN; throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
} }
#ifdef CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS
@ -577,8 +572,18 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_ln_kv_b; struct ggml_tensor * mm_model_ln_kv_b;
struct ggml_tensor * mm_model_ln_post_w; struct ggml_tensor * mm_model_ln_post_w;
struct ggml_tensor * mm_model_ln_post_b; struct ggml_tensor * mm_model_ln_post_b;
// gemma3
struct ggml_tensor * mm_input_proj_w;
struct ggml_tensor * mm_soft_emb_norm_w;
}; };
bool enable_gpu_clip = true;
void set_clip_uses_gpu(bool usegpu)
{
enable_gpu_clip = usegpu;
}
struct clip_ctx { struct clip_ctx {
bool has_text_encoder = false; bool has_text_encoder = false;
bool has_vision_encoder = false; bool has_vision_encoder = false;
@ -591,7 +596,7 @@ struct clip_ctx {
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
int32_t max_feature_layer; int32_t max_feature_layer; // unused in newer models like gemma3
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
bool use_gelu = false; bool use_gelu = false;
@ -603,21 +608,213 @@ struct clip_ctx {
bool has_post_norm = false; bool has_post_norm = false;
bool has_patch_bias = false; bool has_patch_bias = false;
struct gguf_context * ctx_gguf; struct gguf_context * ctx_gguf = nullptr;
struct ggml_context * ctx_data; struct ggml_context * ctx_data = nullptr;
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model std::vector<ggml_backend_t> backend_ptrs;
ggml_backend_buffer_t params_buffer = NULL; std::vector<ggml_backend_buffer_type_t> backend_buft;
ggml_backend_t backend = NULL; ggml_backend_t backend = nullptr;
ggml_gallocr_t compute_alloc = NULL; ggml_backend_buffer_t buf = nullptr;
struct clip_image_size * load_image_size; ggml_backend_sched_ptr sched;
struct clip_image_size * load_image_size = nullptr;
clip_ctx(clip_context_params & ctx_params) {
if(enable_gpu_clip)
{
#ifdef GGML_USE_CUDA
backend = ggml_backend_cuda_init(0);
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef GGML_USE_METAL
backend = ggml_backend_metal_init();
LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
backend = ggml_backend_vk_init(0);
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
}
if (!backend) {
backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
backend_ptrs.push_back(backend);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
);
}
~clip_ctx() {
ggml_free(ctx_data);
gguf_free(ctx_gguf);
ggml_backend_buffer_free(buf);
ggml_backend_free(backend);
}
}; };
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
GGML_ASSERT(imgs->size == 1); // batch_size == 1
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// input raw
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
// position embeddings
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// layernorm1
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
}
// self-attention
{
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
struct ggml_tensor * K =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
struct ggml_tensor * V =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
KQ = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
}
// attention output
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// layernorm2
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
// siglip uses gelu
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
const int batch_size = 1;
const int mm_tokens_per_image = 256; // default value for gemma3
const int tokens_per_side = sqrt(mm_tokens_per_image);
const int patches_per_image = sqrt(num_patches);
const int kernel_size = patches_per_image / tokens_per_side;
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
// doing a pool2d to reduce the number of output tokens to 256
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
// apply norm before projection
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
// apply projection
embeddings = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
embeddings);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
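The pooling branch above is what pins Gemma 3 at 256 image tokens: the SigLIP patch grid is average-pooled down to a 16×16 grid before the projection. A quick sanity check of the arithmetic, assuming the encoder's 896×896 input and 14-pixel patches (those two values come from the Gemma 3 vision config, not from this hunk):

```cpp
// Back-of-the-envelope check for the gemma3 pooling path above.
// image_size and patch_size are assumed SigLIP defaults, not values taken from this diff.
#include <cmath>
#include <cstdio>

int main() {
    const int image_size  = 896;           // assumed SigLIP input resolution
    const int patch_size  = 14;            // assumed SigLIP patch size
    const int per_side    = image_size / patch_size;                              // 64
    const int num_patches = per_side * per_side;                                  // 4096 patch embeddings

    const int mm_tokens_per_image = 256;   // fixed in the hunk above
    const int tokens_per_side   = (int) std::sqrt((double) mm_tokens_per_image);  // 16
    const int patches_per_image = (int) std::sqrt((double) num_patches);          // 64
    const int kernel_size       = patches_per_image / tokens_per_side;            // 4

    // A 4x4 average pool turns the 64x64 patch grid into 16x16 = 256 embeddings.
    std::printf("patches: %d -> image tokens: %d (pool kernel %dx%d)\n",
                num_patches, tokens_per_side * tokens_per_side, kernel_size, kernel_size);
    return 0;
}
```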
@ -1163,7 +1360,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} else { } else {
GGML_ABORT("fatel error"); GGML_ABORT("fatel error");
} }
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { }
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@ -1185,14 +1383,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
return gf; return gf;
} }
bool enable_gpu_clip = true; static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
void set_clip_uses_gpu(bool usegpu) if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
{ return clip_image_build_graph_siglip(ctx, imgs);
enable_gpu_clip = usegpu; } else {
// TODO: we should have one build_* function per model
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
}
} }
// read and create ggml_context containing the tensors and their data // read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return clip_init(fname, clip_context_params{
/* use_gpu */ true,
/* verbosity */ verbosity,
});
}
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
int verbosity = ctx_params.verbosity;
struct ggml_context * meta = NULL; struct ggml_context * meta = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {
@ -1206,6 +1415,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
if (verbosity >= 1) { if (verbosity >= 1) {
try {
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
const int ftype = get_u32(ctx, KEY_FTYPE); const int ftype = get_u32(ctx, KEY_FTYPE);
@ -1224,6 +1434,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: n_kv: %d\n", __func__, n_kv); LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_INF("\n"); LOG_INF("\n");
} catch (std::runtime_error & /*e*/) {
LOG_INF("Could not list CLIP model properties.\n");
}
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
@ -1286,7 +1499,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
clip_ctx * new_clip = new clip_ctx{}; clip_ctx * new_clip = new clip_ctx(ctx_params);
// update projector type // update projector type
{ {
@ -1305,39 +1518,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
if(enable_gpu_clip)
{
#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init();
LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
#ifdef GGML_USE_SYCL
new_clip->backend = ggml_backend_sycl_init(0);
LOG_INF("%s: CLIP using SYCL backend\n", __func__);
#endif
}
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities // model size and capabilities
{ {
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@ -1375,8 +1555,12 @@ if(enable_gpu_clip)
GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(new_clip->has_vision_encoder);
GGML_ASSERT(!new_clip->has_text_encoder); GGML_ASSERT(!new_clip->has_text_encoder);
idx = get_key_idx(ctx, KEY_USE_GELU); try {
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
} catch (std::runtime_error & /*e*/) {
new_clip->use_gelu = false;
}
try { try {
idx = get_key_idx(ctx, KEY_USE_SILU); idx = get_key_idx(ctx, KEY_USE_SILU);
@ -1390,6 +1574,7 @@ if(enable_gpu_clip)
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector); LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
@ -1432,7 +1617,9 @@ if(enable_gpu_clip)
} }
// alloc memory and offload data // alloc memory and offload data
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@ -1445,7 +1632,7 @@ if(enable_gpu_clip)
return nullptr; return nullptr;
} }
int num_bytes = ggml_nbytes(cur); int num_bytes = ggml_nbytes(cur);
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor // for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes); fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else { } else {
@ -1581,11 +1768,17 @@ if(enable_gpu_clip)
} }
try { try {
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
} catch(const std::exception& /*e*/) {
vision_model.patch_embeddings_0 = nullptr;
}
try {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
LOG_ERR("%s: failed to load vision model tensors\n", __func__); vision_model.position_embeddings = nullptr;
} }
try { try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
@ -1696,6 +1889,10 @@ if(enable_gpu_clip)
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} }
else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
}
else { else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -1731,14 +1928,21 @@ if(enable_gpu_clip)
// measure mem requirement and allocate // measure mem requirement and allocate
{ {
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch; clip_image_f32_batch batch;
batch.size = 1; batch.size = 1;
batch.data = nullptr; batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_backend_sched_reserve(new_clip->sched.get(), gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); ggml_backend_t backend = new_clip->backend_ptrs[i];
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
if (size > 1) {
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
}
}
} }
return new_clip; return new_clip;
@ -2317,7 +2521,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
return true; return true;
} }
if (ctx->has_glm_projector) { if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
res_imgs->size = 1; res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size]; res_imgs->data = new clip_image_f32[res_imgs->size];
clip_image_u8 resized_image; clip_image_u8 resized_image;
@ -2506,12 +2710,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
} }
void clip_free(clip_ctx * ctx) { void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
delete ctx; delete ctx;
} }
@ -2707,8 +2905,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
// build the inference graph // build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs // set inputs
const auto & model = ctx->vision_model; const auto & model = ctx->vision_model;
@ -2847,6 +3046,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
} }
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
// do nothing
}
else { else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
@ -2877,7 +3079,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
} }
ggml_backend_graph_compute(ctx->backend, gf); auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
if (status != GGML_STATUS_SUCCESS) {
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
return false;
}
// the last node is the embedding tensor // the last node is the embedding tensor
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@ -3067,6 +3273,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0]; return ctx->vision_model.mm_1_b->ne[0];
} }
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return ctx->vision_model.mm_input_proj_w->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));

View file

@ -39,8 +39,15 @@ struct clip_image_f32_batch {
size_t size; size_t size;
}; };
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); struct clip_context_params {
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); bool use_gpu;
int verbosity;
};
// deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
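A minimal usage sketch for the new entry point declared above; the file name, flag values and error handling are illustrative, not taken from this change:
// sketch: loading a multimodal projector through clip_init() instead of the deprecated clip_model_load()
struct clip_context_params ctx_params = {
    /* use_gpu   */ true,   // illustrative: set to false to force the CPU backend
    /* verbosity */ 1,
};
struct clip_ctx * ctx_clip = clip_init("mmproj.gguf", ctx_params); // "mmproj.gguf" is a placeholder path
if (ctx_clip == nullptr) {
    // load failed; handle the error as the caller sees fit
} else {
    // ... preprocess and encode images as before ...
    clip_free(ctx_clip);
}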

View file

@ -0,0 +1,341 @@
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "stb_image.h"
#include "llama.h"
#include "ggml.h"
#include "console.h"
#include <vector>
#include <limits.h>
#include <inttypes.h>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
static bool g_is_generating = false;
/**
 * Please note that this is NOT production-ready code.
* It is a playground for trying Gemma 3 vision capabilities.
* For contributors: please keep this code simple and easy to understand.
*/
static void show_additional_info(int /*argc*/, char ** argv) {
LOG(
"Experimental CLI for using Gemma 3 vision model\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
argv[0]
);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (g_is_generating) {
g_is_generating = false;
} else {
console::cleanup();
LOG("\nInterrupted by user\n");
_exit(130);
}
}
}
#endif
struct gemma3_context {
struct clip_ctx * ctx_clip = NULL;
common_init_result llama_init;
llama_model * model;
llama_context * lctx;
const llama_vocab * vocab;
llama_batch batch;
int n_threads = 1;
llama_pos n_past = 0;
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init.model.get();
lctx = llama_init.context.get();
vocab = llama_model_get_vocab(model);
n_threads = params.cpuparams.n_threads;
batch = llama_batch_init(params.n_batch, 0, 1);
init_clip_model(params);
}
void init_clip_model(common_params & params) {
const char * clip_path = params.mmproj.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}
~gemma3_context() {
clip_free(ctx_clip);
}
};
struct decode_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
common_batch_clear(ctx.batch);
for (llama_token & t : tokens) {
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
}
if (logits_last) {
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
}
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
if (llama_decode(ctx.lctx, ctx.batch)) {
LOG_ERR("Failed to decode text\n");
return 1;
}
return 0;
}
static int eval_image(gemma3_context & ctx, std::string & fname) {
std::vector<float> image_embd_v;
int n_embd = llama_model_n_embd(ctx.model);
int n_tokens = 256;
image_embd_v.resize(n_tokens * n_embd);
bool ok;
struct clip_image_u8 * img_u8 = clip_image_u8_init();
ok = clip_image_load_from_file(fname.c_str(), img_u8);
if (!ok) {
LOG_ERR("Unable to load image %s\n", fname.c_str());
clip_image_u8_free(img_u8);
return 2; // non-fatal error
}
clip_image_f32_batch batch_f32;
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
return 1;
}
int64_t t0 = ggml_time_ms();
LOG("Encoding image %s\n", fname.c_str());
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
if (!ok) {
LOG_ERR("Unable to encode image\n");
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
return 1;
}
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
// decode image embeddings
int64_t t1 = ggml_time_ms();
eval_text(ctx, "<start_of_image>");
llama_set_causal_attn(ctx.lctx, false);
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
if (llama_decode(ctx.lctx, batch_img.batch)) {
LOG_ERR("failed to decode image\n");
return 1;
}
ctx.n_past += n_tokens;
llama_set_causal_attn(ctx.lctx, true);
eval_text(ctx, "<end_of_image>");
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
return 0;
}
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
for (int i = 0; i < n_predict; i++) {
if (i > n_predict || !g_is_generating) {
printf("\n");
break;
}
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
common_sampler_accept(smpl, token_id, true);
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
printf("\n");
break; // end of generation
}
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
fflush(stdout);
// eval the token
common_batch_clear(ctx.batch);
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
if (llama_decode(ctx.lctx, ctx.batch)) {
LOG_ERR("failed to decode token\n");
return 1;
}
}
return 0;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
common_init();
if (params.mmproj.empty()) {
show_additional_info(argc, argv);
return 1;
}
gemma3_context ctx(params);
printf("%s: %s\n", __func__, params.model.c_str());
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
// ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
}
if (eval_text(ctx, "<bos>")) {
return 1;
}
if (is_single_turn) {
g_is_generating = true;
if (eval_text(ctx, "<start_of_turn>user\n")) {
return 1;
}
for (auto & fname : params.image) {
if (eval_image(ctx, fname)) {
return 1;
}
}
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
return 1;
}
} else {
LOG("\n Running in chat mode, available commands:");
LOG("\n /image <path> load an image");
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
if (eval_text(ctx, "<start_of_turn>user\n")) {
return 1;
}
while (true) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
std::string line;
console::readline(line, false);
console::set_display(console::reset);
line = string_strip(line);
if (line.empty()) {
continue;
}
if (line == "/quit" || line == "/exit") {
break;
}
if (line == "/clear") {
ctx.n_past = 0;
llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
LOG("Chat history cleared\n\n");
continue;
}
g_is_generating = true;
if (line.find("/image") == 0) {
std::string image = line.substr(7);
int res = eval_image(ctx, image);
if (res == 2) {
continue; // image not found
}
if (res) {
return 1;
}
continue;
}
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
return 1;
}
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
return 1;
}
}
}
return 0;
}
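Going by the usage string in show_additional_info(), a single-turn run of this example looks roughly like the following; the binary name and the file names are illustrative placeholders:
./gemma3-cli -m gemma-3-4b-it.gguf --mmproj mmproj.gguf --image input.jpg -p "describe the image in detail"
Leaving out --image and -p drops into the interactive chat mode with the /image, /clear and /quit commands handled in main() above.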

View file

@ -0,0 +1,307 @@
import gguf
import argparse
import logging
import sys
import torch
import json
import os
import numpy as np
from typing import cast, ContextManager, Any, Iterator
from pathlib import Path
from torch import Tensor
logger = logging.getLogger("gemma3-mmproj")
# (copied from convert_hf_to_gguf.py)
# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
# to keep the type-checker happy
dtype: torch.dtype
shape: torch.Size
# only used when converting a torch.Tensor to a np.ndarray
_dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16,
torch.float32: np.float32,
}
# used for safetensors slices
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
_dtype_str_map: dict[str, torch.dtype] = {
"F64": torch.float64,
"F32": torch.float32,
"BF16": torch.bfloat16,
"F16": torch.float16,
# "U64": torch.uint64,
"I64": torch.int64,
# "U32": torch.uint32,
"I32": torch.int32,
# "U16": torch.uint16,
"I16": torch.int16,
"U8": torch.uint8,
"I8": torch.int8,
"BOOL": torch.bool,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2,
}
def numpy(self) -> gguf.LazyNumpyTensor:
dtype = self._dtype_map[self.dtype]
return gguf.LazyNumpyTensor(
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
args=(self,),
func=(lambda s: s.numpy())
)
@classmethod
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
return torch.empty(size=shape, dtype=dtype, device="meta")
@classmethod
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
dtype = cls._dtype_str_map[st_slice.get_dtype()]
shape: tuple[int, ...] = tuple(st_slice.get_shape())
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
return cast(torch.Tensor, lazy)
@classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None):
del types # unused
if kwargs is None:
kwargs = {}
if func is torch.Tensor.numpy:
return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs)
class Gemma3VisionTower:
hparams: dict
gguf_writer: gguf.GGUFWriter
fname_out: Path
ftype: gguf.LlamaFileType
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
return part_names
def __init__(self,
dir_model: Path,
fname_out: Path,
ftype: gguf.LlamaFileType,
is_big_endian: bool,):
hparams = Gemma3VisionTower.load_hparams(dir_model)
self.hparams = hparams
self.fname_out = fname_out
self.ftype = ftype
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
text_config = hparams["text_config"]
vision_config = hparams["vision_config"]
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
assert text_config is not None
assert vision_config is not None
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
        # default values taken from HF transformers code
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
self.gguf_writer.add_bool ("clip.use_gelu", True)
# load tensors
for name, data_torch in self.get_tensors(dir_model):
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
self.add_tensor(name, data_torch)
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
tensor_names_from_parts: set[str] = set()
for part_name in part_names:
logger.info(f"gguf: loading model part '{part_name}'")
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
with ctx as model_part:
tensor_names_from_parts.update(model_part.keys())
for name in model_part.keys():
data = model_part.get_slice(name)
data = LazyTorchTensor.from_safetensors_slice(data)
yield name, data
def add_tensor(self, name: str, data_torch: Tensor):
is_1d = len(data_torch.shape) == 1
is_embd = ".embeddings." in name
old_dtype = data_torch.dtype
can_quantize = not is_1d and not is_embd
data_qtype = gguf.GGMLQuantizationType.F32
        # this is to support old checkpoints
# TODO: remove this when we have the final model
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
name = name.replace("multimodal_projector.", "multi_modal_projector.")
# filter only vision tensors
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
return
# prefix
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
name = name.replace("vision_tower.vision_model.", "v.")
# projector and input embd
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
name = name.replace(
"multi_modal_projector.mm_input_projection_weight",
"mm.input_projection.weight"
)
name = name.replace(
"multi_modal_projector.mm_soft_emb_norm.weight",
"mm.soft_emb_norm.weight"
)
name = name.replace("post_layernorm.", "post_ln.")
# each block
name = name.replace(".self_attn.k_proj.", ".attn_k.")
name = name.replace(".self_attn.v_proj.", ".attn_v.")
name = name.replace(".self_attn.q_proj.", ".attn_q.")
name = name.replace(".self_attn.out_proj.", ".attn_out.")
name = name.replace(".layer_norm1.", ".ln1.")
name = name.replace(".layer_norm2.", ".ln2.")
name = name.replace(".mlp.fc1.", ".ffn_down.")
name = name.replace(".mlp.fc2.", ".ffn_up.")
if can_quantize:
if self.ftype == gguf.LlamaFileType.ALL_F32:
data_qtype = gguf.GGMLQuantizationType.F32
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
data_qtype = gguf.GGMLQuantizationType.F16
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0
else:
raise ValueError(f"Unsupported file type: {self.ftype}")
        # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
# the other norm values are part of SigLIP model, and they are already correct
# ref code: Gemma3RMSNorm
if "soft_emb_norm.weight" in name:
logger.info(f"Correcting norm value for '{name}'")
data_torch = data_torch + 1
data = data_torch.numpy()
try:
data = gguf.quants.quantize(data, data_qtype)
except Exception as e:
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
data_qtype = gguf.GGMLQuantizationType.F16
data = gguf.quants.quantize(data, data_qtype)
# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
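As a concrete illustration of the renaming above (layer index 0 chosen arbitrarily), the replace rules turn HF tensor names into GGUF names like this:
vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight -> v.blk.0.attn_q.weight
vision_tower.vision_model.embeddings.patch_embedding.weight -> v.patch_embd.weight
multi_modal_projector.mm_input_projection_weight -> mm.input_projection.weight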
def write(self):
self.gguf_writer.write_header_to_file(path=self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
parser.add_argument(
"--outfile", type=Path, default="mmproj.gguf",
help="path to write to",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
help="output format",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file",
nargs="?",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
args = parser.parse_args()
if args.model is None:
parser.error("the following arguments are required: model")
return args
def main() -> None:
args = parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
dir_model = args.model
if not dir_model.is_dir():
logger.error(f'Error: {args.model} is not a directory')
sys.exit(1)
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
}
logger.info(f"Loading model: {dir_model.name}")
with torch.inference_mode():
gemma3_vision_tower = Gemma3VisionTower(
dir_model=dir_model,
fname_out=args.outfile,
ftype=ftype_map[args.outtype],
is_big_endian=args.bigendian,
)
gemma3_vision_tower.write()
if __name__ == '__main__':
main()
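With the argument parser above, a conversion run looks something like the following; the script filename is illustrative, and the positional argument is the directory containing the HF config.json and safetensors shards:
python gemma3_convert_encoder_to_gguf.py /path/to/gemma-3-model --outfile mmproj-gemma3-f16.gguf --outtype f16
--outtype accepts f32, f16, bf16 or q8_0, and --bigendian targets big-endian hosts.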

View file

@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
if (prompt.empty()) { if (prompt.empty()) {
prompt = "describe the image in detail."; prompt = "describe the image in detail.";
} }
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); struct clip_context_params clip_params = {
/* use_gpu */ params->n_gpu_layers != 0,
/* verbosity */ params->verbosity,
};
auto * ctx_clip = clip_init(clip_path, clip_params);
return ctx_clip; return ctx_clip;
} }
@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) { if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false); size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t j = 0; j < num_image_embeds_col; ++j) { for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false); for (size_t j = 0; j < num_image_embeds_col; ++j) {
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
if (j == num_image_embeds_col - 1) { eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
} }
} }
} }
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
} }
LOG_INF("%s: image token past: %d\n", __func__, n_past); LOG_INF("%s: image token past: %d\n", __func__, n_past);
} }

View file

@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-" fname_middle = "mmproj-"
has_text_encoder = False has_text_encoder = False
has_minicpmv_projector = True has_minicpmv_projector = True
minicpmv_version = 4
elif args.vision_only: elif args.vision_only:
fname_middle = "vision-" fname_middle = "vision-"
has_text_encoder = False has_text_encoder = False

View file

@ -384,8 +384,9 @@ struct server_task {
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
common_grammar_trigger trigger; common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
trigger.value = (llama_token) token; trigger.value = word;
params.sampling.grammar_triggers.push_back(trigger); trigger.token = token;
params.sampling.grammar_triggers.push_back(std::move(trigger));
} else { } else {
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
@ -750,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
{"name", tc.name}, {"name", tc.name},
{"arguments", tc.arguments}, {"arguments", tc.arguments},
}}, }},
{"id", tc.id}, // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
// We only generate a random id for the ones that don't generate one by themselves
// (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
{"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
}); });
} }
message["tool_calls"] = tool_calls; message["tool_calls"] = tool_calls;

View file

@ -92,6 +92,7 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
tool_call = tool_calls[0] tool_call = tool_calls[0]
assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
assert expected_function_name == tool_call["function"]["name"] assert expected_function_name == tool_call["function"]["name"]
actual_arguments = tool_call["function"]["arguments"] actual_arguments = tool_call["function"]["arguments"]
@ -373,6 +374,7 @@ def do_test_weather(server: ServerProcess, **kwargs):
tool_call = tool_calls[0] tool_call = tool_calls[0]
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}' assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
actual_arguments = json.loads(tool_call["function"]["arguments"]) actual_arguments = json.loads(tool_call["function"]["arguments"])
assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
location = actual_arguments["location"] location = actual_arguments["location"]
@ -596,6 +598,7 @@ def do_test_hello_world(server: ServerProcess, **kwargs):
tool_call = tool_calls[0] tool_call = tool_calls[0]
# assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
actual_arguments = json.loads(tool_call["function"]["arguments"]) actual_arguments = json.loads(tool_call["function"]["arguments"])
assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
code = actual_arguments["code"] code = actual_arguments["code"]

View file

@ -435,6 +435,10 @@ static std::string gen_chatcmplid() {
return "chatcmpl-" + random_string(); return "chatcmpl-" + random_string();
} }
static std::string gen_tool_call_id() {
return random_string();
}
// //
// other common utils // other common utils
// //

View file

@ -498,7 +498,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
search_paths.push_back(get_executable_path()); search_paths.push_back(get_executable_path());
search_paths.push_back(fs::current_path()); search_paths.push_back(fs::current_path());
} else { } else {
search_paths.push_back(user_search_path); search_paths.push_back(fs::u8path(user_search_path));
} }
int best_score = 0; int best_score = 0;
@ -512,9 +512,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) { for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
auto filename = entry.path().filename().native(); auto filename = entry.path().filename();
auto ext = entry.path().extension().native(); auto ext = entry.path().extension();
if (filename.find(file_prefix) == 0 && ext == file_extension) { if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
dl_handle_ptr handle { dl_load_library(entry) }; dl_handle_ptr handle { dl_load_library(entry) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
@ -545,7 +545,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
// try to load the base backend // try to load the base backend
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native(); fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
fs::path path = search_path.native() + filename.native(); fs::path path = search_path / filename;
if (fs::exists(path)) { if (fs::exists(path)) {
return get_reg().load_backend(path, silent); return get_reg().load_backend(path, silent);
} }

View file

@ -1461,7 +1461,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
bool parallel) { bool parallel) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); // GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

View file

@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
c = __builtin_amdgcn_sdot4(a, b, c, false); c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) #elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__) #elif defined(RDNA1) || defined(__gfx900__)
int tmp1; int tmp1;
int tmp2; int tmp2;
asm("\n \ asm("\n \

View file

@ -52,12 +52,11 @@ typedef half (*vec_dot_KQ_f16_t)(
typedef float (*vec_dot_KQ_f32_t)( typedef float (*vec_dot_KQ_f32_t)(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
template<typename T, int D> template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c; const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_v); GGML_UNUSED(Q_v);
T sum = 0.0f; T sum = 0.0f;
@ -93,12 +92,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
return sum; return sum;
} }
template<typename T, int D> template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c; const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_v); GGML_UNUSED(Q_v);
T sum = 0.0f; T sum = 0.0f;
@ -138,12 +136,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
return sum; return sum;
} }
template<typename T, int D> template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c; const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_v); GGML_UNUSED(Q_v);
T sum = 0.0f; T sum = 0.0f;
@ -186,12 +183,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
return sum; return sum;
} }
template<typename T, int D> template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c; const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_v); GGML_UNUSED(Q_v);
T sum = 0.0f; T sum = 0.0f;
@ -238,12 +234,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
return sum; return sum;
} }
template <typename T, int D> template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c; const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_v); GGML_UNUSED(Q_v);
T sum = 0.0f; T sum = 0.0f;
@ -272,12 +267,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
return sum; return sum;
} }
template <typename T, int D> template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16( static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
const half2 * K_h2 = (const half2 *) K_c; const half2 * K_h2 = (const half2 *) K_c;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
GGML_UNUSED(Q_q8); GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v); GGML_UNUSED(Q_ds_v);
@ -480,25 +474,25 @@ static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ v
return x[i]; return x[i];
} }
template <int D> template <int D, int warp_size = WARP_SIZE>
constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) { constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> : return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> : type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> : type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> : type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> : type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> : type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
nullptr; nullptr;
} }
template <int D> template <int D, int warp_size = WARP_SIZE>
constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) { constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> : return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> : type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> : type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> : type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> : type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> : type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
nullptr; nullptr;
} }
@ -681,7 +675,8 @@ static void on_no_fattn_vec_case(const int D) {
template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride> template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
void launch_fattn( void launch_fattn(
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V,
const int warp_size = WARP_SIZE
) { ) {
constexpr int ncols = ncols1 * ncols2; constexpr int ncols = ncols1 * ncols2;
@ -704,8 +699,6 @@ void launch_fattn(
GGML_ASSERT(Q->ne[3] == 1); GGML_ASSERT(Q->ne[3] == 1);
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
ggml_cuda_pool & pool = ctx.pool(); ggml_cuda_pool & pool = ctx.pool();
cudaStream_t main_stream = ctx.stream(); cudaStream_t main_stream = ctx.stream();
const int id = ggml_cuda_get_device(); const int id = ggml_cuda_get_device();
@ -805,7 +798,6 @@ void launch_fattn(
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
GGML_ASSERT(block_dim.x % warp_size == 0); GGML_ASSERT(block_dim.x % warp_size == 0);
GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>( fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
(const char *) Q->data, (const char *) Q->data,
K_data, K_data,

View file

@ -469,6 +469,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16; constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
float logit_softcap; float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float)); memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
@ -485,7 +486,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
fattn_kernel = flash_attn_ext_f16< fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} }
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true); launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
return; return;
} }
if (2*blocks_num_pb1 < 2*nsm) { if (2*blocks_num_pb1 < 2*nsm) {
@ -500,7 +501,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
fattn_kernel = flash_attn_ext_f16< fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} }
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true); launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
return; return;
} }
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
@ -514,7 +515,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
fattn_kernel = flash_attn_ext_f16< fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} }
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true); launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
} }
void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View file

@ -47,11 +47,89 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
1; 1;
} }
enum mmvq_parameter_table_id {
MMVQ_PARAMETERS_GENERIC = 0,
MMVQ_PARAMETERS_GCN,
MMVQ_PARAMETERS_RDNA2
};
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
#if defined(RDNA2) || defined(RDNA3)
return MMVQ_PARAMETERS_RDNA2;
#elif defined(GCN) || defined(CDNA)
return MMVQ_PARAMETERS_GCN;
#else
return MMVQ_PARAMETERS_GENERIC;
#endif
}
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
return MMVQ_PARAMETERS_RDNA2;
}
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
return MMVQ_PARAMETERS_GCN;
}
return MMVQ_PARAMETERS_GENERIC;
}
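// note: this host-side overload and the constexpr __device__ overload above must select the same
// table id, since the host uses it to size the launch while the kernel bakes the matching nwarps
// and rows_per_cuda_block in at compile time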
static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_y) {
case 1:
case 2:
case 3:
case 4:
return 4;
case 5:
case 6:
case 7:
case 8:
return 2;
default:
return 1;
}
} else if (table_id == MMVQ_PARAMETERS_GCN) {
switch (ncols_y) {
case 1:
case 2:
case 3:
case 4:
return 2;
case 5:
case 6:
case 7:
case 8:
default:
return 1;
}
}
return 1;
}
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
switch (ncols_y) {
case 1:
return 1;
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
return 2;
default:
return 1;
}
}
return 1;
}
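// illustration: a few spot-checks of the tables above (values follow directly from the switches;
// calc_rows_per_block takes the table id as a plain int, so the enum converts implicitly)
static_assert(calc_nwarps(4, MMVQ_PARAMETERS_GENERIC) == 4, "generic: ncols_y <= 4 uses 4 warps");
static_assert(calc_rows_per_block(4, MMVQ_PARAMETERS_GENERIC) == 2, "generic: ncols_y > 1 uses 2 rows per block");
static_assert(calc_nwarps(4, MMVQ_PARAMETERS_RDNA2) == 1, "rdna2/rdna3: always 1 warp");
static_assert(calc_rows_per_block(1, MMVQ_PARAMETERS_GCN) == 1, "gcn/cdna: ncols_y == 1 uses 1 row per block");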
template <ggml_type type, int ncols_y> template <ggml_type type, int ncols_y>
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below // tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) __launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q( static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
@ -59,24 +137,20 @@ static __global__ void mul_mat_vec_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk; constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi; constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type); constexpr int vdr = get_vdr_mmvq(type);
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
constexpr int nwarps = calc_nwarps(ncols_y, table_id);
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) const int tid = warp_size*threadIdx.y + threadIdx.x;
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x;
const int blocks_per_row_x = ncols_x / qk; const int blocks_per_row_x = ncols_x / qk;
const int blocks_per_col_y = nrows_y / QK8_1; const int blocks_per_col_y = nrows_y / QK8_1;
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// partial sum for each thread // partial sum for each thread
float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
@ -96,7 +170,7 @@ static __global__ void mul_mat_vec_q(
} }
} }
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
if (threadIdx.y > 0) { if (threadIdx.y > 0) {
#pragma unroll #pragma unroll
for (int j = 0; j < ncols_y; ++j) { for (int j = 0; j < ncols_y; ++j) {
@ -120,7 +194,7 @@ static __global__ void mul_mat_vec_q(
for (int l = 0; l < nwarps-1; ++l) { for (int l = 0; l < nwarps-1; ++l) {
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
} }
tmp[j][i] = warp_reduce_sum(tmp[j][i]); tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
} }
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) { if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
@ -129,6 +203,13 @@ static __global__ void mul_mat_vec_q(
} }
} }
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
return {block_nums, block_dims};
}
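// illustration, assuming a generic (NVIDIA-style) device with warp_size == 32, ncols_y == 2 and
// nrows_x == 4096:
//   rows_per_block = 2  ->  block_nums = dim3(2048, 1, 1)
//   nwarps         = 4  ->  block_dims = dim3(32, 4, 1), i.e. 128 threads per block
// on RDNA2/RDNA3 the same call yields 1 warp and 1 row per block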
template <ggml_type type> template <ggml_type type>
static void mul_mat_vec_q_cuda( static void mul_mat_vec_q_cuda(
const void * vx, const void * vy, float * dst, const void * vx, const void * vy, float * dst,
@ -137,65 +218,67 @@ static void mul_mat_vec_q_cuda(
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
int id = ggml_cuda_get_device(); const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
int64_t nwarps = 1; const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
switch(ncols_y) {
case 1:
nwarps = 4;
rows_per_cuda_block = 1;
break;
case 2:
case 3:
case 4:
nwarps = 4;
rows_per_cuda_block = 2;
break;
case 5:
case 6:
case 7:
case 8:
nwarps = 2;
rows_per_cuda_block = 2;
break;
default:
GGML_ABORT("fatal error");
break;
}
}
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(WARP_SIZE, nwarps, 1);
switch (ncols_y) { switch (ncols_y) {
case 1: case 1:
mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 2: case 2:
mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 3: case 3:
mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 4: case 4:
mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 5: case 5:
mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 6: case 6:
mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 7: case 7:
mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
case 8: case 8:
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); {
constexpr int c_ncols_y = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
}
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
break; break;

View file

@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
static struct ggml_backend_metal_device_context { static struct ggml_backend_metal_device_context {
id<MTLDevice> mtl_device; id<MTLDevice> mtl_device;
int mtl_device_ref_count; int mtl_device_ref_count;
id<MTLLibrary> mtl_library;
bool has_simdgroup_reduction; bool has_simdgroup_reduction;
bool has_simdgroup_mm; bool has_simdgroup_mm;
@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
} g_ggml_ctx_dev_main = { } g_ggml_ctx_dev_main = {
/*.mtl_device =*/ nil, /*.mtl_device =*/ nil,
/*.mtl_device_ref_count =*/ 0, /*.mtl_device_ref_count =*/ 0,
/*.mtl_library =*/ nil,
/*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_reduction =*/ false,
/*.has_simdgroup_mm =*/ false, /*.has_simdgroup_mm =*/ false,
/*.has_residency_sets =*/ false, /*.has_residency_sets =*/ false,
@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
ctx->mtl_device_ref_count--; ctx->mtl_device_ref_count--;
if (ctx->mtl_device_ref_count == 0) { if (ctx->mtl_device_ref_count == 0) {
if (ctx->mtl_library) {
[ctx->mtl_library release];
ctx->mtl_library = nil;
}
if (ctx->mtl_device) { if (ctx->mtl_device) {
[ctx->mtl_device release]; [ctx->mtl_device release];
ctx->mtl_device = nil; ctx->mtl_device = nil;
@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
return data; return data;
} }
// load library
//
// - first check if the library is embedded
// - then check if the library is in the bundle
// - if not found, load the source and compile it
// - if that fails, return NULL
static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
id<MTLLibrary> metal_library = nil;
NSError * error = nil;
NSString * src = nil;
#if GGML_METAL_EMBED_LIBRARY
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
#ifdef SWIFT_PACKAGE
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary is located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}
if (path_lib != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
metal_library = [device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
} else {
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * path_source;
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
if (path_resource) {
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
} else {
path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
}
if (path_source == nil) {
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
path_source = @"ggml-metal.metal";
}
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
#endif
if (!metal_library) {
@autoreleasepool {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
if (use_bfloat) {
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
}
#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif
MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;
//[options setFastMathEnabled:false];
metal_library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
#if !__has_feature(objc_arc)
[options release];
#endif
}
}
#if GGML_METAL_EMBED_LIBRARY
[src release];
#endif // GGML_METAL_EMBED_LIBRARY
return metal_library;
}
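// with this helper in place, ggml_metal_init() below creates the library at most once per device
// context, caches it in ctx_dev->mtl_library, and ggml_backend_metal_device_rel() releases it
// together with the device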
static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) { static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("%s: allocating\n", __func__); GGML_LOG_INFO("%s: allocating\n", __func__);
@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library = nil;
// load library // load library
// if (ctx_dev->mtl_library == nil) {
// - first check if the library is embedded ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
// - then check if the library is in the bundle }
// - if not found, load the source and compile it id<MTLLibrary> metal_library = ctx_dev->mtl_library;
// - if that fails, return NULL if (metal_library == nil) {
{ GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
NSError * error = nil; return NULL;
NSString * src = nil;
#if GGML_METAL_EMBED_LIBRARY
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
#ifdef SWIFT_PACKAGE
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}
if (path_lib != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
metal_library = [device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
} else {
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * path_source;
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
if (path_resource) {
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
} else {
path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
}
if (path_source == nil) {
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
path_source = @"ggml-metal.metal";
}
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
#endif
if (!metal_library) {
@autoreleasepool {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
if (ctx_dev->use_bfloat) {
[prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
}
#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif
MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;
//[options setFastMathEnabled:false];
metal_library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
#if !__has_feature(objc_arc)
[options release];
#endif
}
}
#if GGML_METAL_EMBED_LIBRARY
[src release];
#endif // GGML_METAL_EMBED_LIBRARY
} }
// print MTL GPU family: // print MTL GPU family:
@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
[metal_function release]; \ [metal_function release]; \
if (error) { \ if (error) { \
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \ return NULL; \
} \ } \
} else { \ } else { \
@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
} }
[metal_library release];
return ctx; return ctx;
} }

View file

@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
endif () endif ()
add_compile_definitions(GGML_OPENCL_SOA_Q) add_compile_definitions(GGML_OPENCL_SOA_Q)
add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
if (GGML_OPENCL_USE_ADRENO_KERNELS) if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno") message(STATUS "OpenCL will use matmul kernels optimized for Adreno")

View file

@ -1,4 +1,4 @@
#define CL_TARGET_OPENCL_VERSION 220 #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
// suppress warnings in CL headers for GCC and Clang // suppress warnings in CL headers for GCC and Clang
@ -25,6 +25,8 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <memory>
#include <charconv>
#undef MIN #undef MIN
#undef MAX #undef MAX
@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
X1E, X1E,
}; };
struct ggml_cl_version {
cl_uint major = 0;
cl_uint minor = 0;
};
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
size_t major_str_begin = 0;
size_t major_str_end = str.find(".", major_str_begin);
if (major_str_end == std::string::npos) {
return {};
}
size_t minor_str_begin = major_str_end + 1;
size_t minor_str_end = str.find(" ", minor_str_begin);
if (minor_str_end == std::string::npos) {
return {};
}
cl_uint version_major;
if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
return {};
}
cl_uint version_minor;
if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
return {};
}
return { version_major, version_minor };
}
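// illustration of the expected input (the callers below strip the "OpenCL " / "OpenCL C " prefix first):
//   parse_cl_version("3.0 ")  -> {3, 0}
//   parse_cl_version("3.0")   -> {0, 0}   // no space after the minor version
//   parse_cl_version("foo")   -> {0, 0}   // no '.' separator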
// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
size_t param_size;
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
std::unique_ptr<char[]> param_storage(new char[param_size]);
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
auto param_value = std::string_view(param_storage.get(), param_size);
const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
if (param_value.find(version_prefix) != 0) {
return {};
}
param_value.remove_prefix(version_prefix.length());
return parse_cl_version(param_value);
}
// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
size_t param_size;
#if CL_TARGET_OPENCL_VERSION >= 300
if (platform_version.major >= 3) {
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
if (!param_size) {
return {};
}
std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
unsigned versions_count = param_size / sizeof(cl_name_version);
cl_version version_max = 0;
for (unsigned i = 0; i < versions_count; i++) {
version_max = std::max<cl_version>(versions[i].version, version_max);
}
return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
}
#else
GGML_UNUSED(platform_version);
#endif // CL_TARGET_OPENCL_VERSION >= 300
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
if (!param_size) {
return {};
}
std::unique_ptr<char[]> param_storage(new char[param_size]);
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
auto param_value = std::string_view(param_storage.get(), param_size);
const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
if (param_value.find(version_prefix) != 0) {
return {};
}
param_value.remove_prefix(version_prefix.length());
return parse_cl_version(param_value);
}
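// summary: when compiled with CL_TARGET_OPENCL_VERSION >= 300 and the platform reports OpenCL >= 3.0,
// the highest entry of CL_DEVICE_OPENCL_C_ALL_VERSIONS wins (e.g. {1.0, 1.1, 1.2, 3.0} -> {3, 0});
// otherwise the "OpenCL C X.Y ..." string from CL_DEVICE_OPENCL_C_VERSION is parsed instead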
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
if (strstr(device_name, "730") || if (strstr(device_name, "730") ||
strstr(device_name, "740") || strstr(device_name, "740") ||
@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// A local ref of cl_device_id for convenience // A local ref of cl_device_id for convenience
cl_device_id device = backend_ctx->device; cl_device_id device = backend_ctx->device;
// Check device OpenCL version, OpenCL 2.0 or above is required ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
size_t device_ver_str_size;
clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
device_ver_buffer[device_ver_str_size] = '\0';
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
if (strstr(device_ver_buffer, "OpenCL 2") == NULL && // Check device OpenCL version, OpenCL 2.0 or above is required
strstr(device_ver_buffer, "OpenCL 3") == NULL) { ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
if (opencl_c_version.major < 2) {
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
return backend_ctx; return backend_ctx;
} }
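// the OpenCL >= 2.0 requirement is now checked against the parsed OpenCL C version instead of
// substring-matching the CL_DEVICE_VERSION string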
@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x) // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
if (strstr(device_ver_buffer, "OpenCL 3") && if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
strstr(ext_buffer, "cl_intel_subgroups") == NULL) { strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
"(note that subgroups is an optional feature in OpenCL 3.0)\n"); "(note that subgroups is an optional feature in OpenCL 3.0)\n");
@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
const std::string kernel_src = read_file("ggml-opencl.cl"); const std::string kernel_src = read_file("ggml-opencl.cl");
#endif #endif
std::string compile_opts = auto opencl_c_std =
"-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations " std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
"-cl-finite-math-only -cl-fast-relaxed-math ";
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-unsafe-math-optimizations"
" -cl-finite-math-only -cl-fast-relaxed-math";
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts); backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
// Non matmul kernels. // Non matmul kernels.
@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err)); CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
// Gemv general // Gemv general
std::string CL_gemv_compile_opts = std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-std=CL2.0 " " -cl-mad-enable "
" -cl-mad-enable " " -DSIMDGROUP_WIDTH=" +
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) { if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
} }
@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err)); CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
// Gemv 2048, 16384 // Gemv 2048, 16384
CL_gemv_compile_opts = CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-std=CL2.0 " " -cl-mad-enable "
" -cl-mad-enable " " -DLINE_STRIDE_A=2048 "
" -DLINE_STRIDE_A=2048 " " -DBLOCK_STRIDE_A=16384 "
" -DBLOCK_STRIDE_A=16384 " " -DSIMDGROUP_WIDTH=" +
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) { if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
} }
@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err)); CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
// Gemv 2048, 16384 // Gemv 2048, 16384
CL_gemv_compile_opts = CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-std=CL2.0 " " -cl-mad-enable "
" -cl-mad-enable " " -DLINE_STRIDE_A=2048 "
" -DLINE_STRIDE_A=2048 " " -DBLOCK_STRIDE_A=16384 "
" -DBLOCK_STRIDE_A=16384 " " -DSIMDGROUP_WIDTH=" +
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) { if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
} }
@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err)); CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
// Gemv 5504, 44032 // Gemv 5504, 44032
CL_gemv_compile_opts = CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-std=CL2.0 " " -cl-mad-enable "
" -cl-mad-enable " " -DLINE_STRIDE_A=5504 "
" -DLINE_STRIDE_A=5504 " " -DBLOCK_STRIDE_A=44032 "
" -DBLOCK_STRIDE_A=44032 " " -DSIMDGROUP_WIDTH=" +
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) { if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
} }
@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err)); CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
// Gemv 16000, 128000 // Gemv 16000, 128000
CL_gemv_compile_opts = CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-std=CL2.0 " " -cl-mad-enable "
" -cl-mad-enable " " -DLINE_STRIDE_A=16000 "
" -DLINE_STRIDE_A=16000 " " -DBLOCK_STRIDE_A=128000 "
" -DBLOCK_STRIDE_A=128000 " " -DSIMDGROUP_WIDTH=" +
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); std::to_string(backend_ctx->adreno_wave_size);
if (has_vector_subgroup_broadcast) { if (has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
} }

View file

@ -3,44 +3,42 @@
#include <cassert> #include <cassert>
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl> template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const sycl::nd_item<3> &item_ct1) { const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
item_ct1.get_local_id(1);
if (row >= nrows) { if (row >= nrows) {
return; return;
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
assert(blocks_per_warp>0);
// partial sum for each thread assert(blocks_per_warp > 0);
// partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx; const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
i += blocks_per_warp) { const int ibx = row * blocks_per_row + i; // x block index
const int ibx = row*blocks_per_row + i; // x block index
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
const int iqs = for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
vdr * const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
(item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int
(qi / vdr)); // x block quant index when casting the quants to int
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
}
} }
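// the new inner loop over elem appears intended to keep the kernel correct when the sub-group
// width (WARP_SIZE) is smaller than qi / vdr; with the common block types it runs exactly once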
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
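// XOR-shuffle butterfly: after log2(WARP_SIZE) exchanges every lane holds the full row sum,
// so lane 0 can write the result below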
if (item_ct1.get_local_id(2) == 0) { if (item_ct1.get_local_id(2) == 0) {
@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
} }
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
assert(blocks_per_warp>0); assert(blocks_per_warp>0);
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_0 == 0); GGML_ASSERT(ncols % QK4_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>( VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_1 == 0); GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>( VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK5_0 == 0); GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>( VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK5_1 == 0); GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>( VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK8_0 == 0); GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>( VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>( VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>( VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>( VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>( VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
@@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>( VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
@@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>( mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler & cgh) { stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>( mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>( mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>( mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>( mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>( mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -860,13 +858,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>( mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_NL == 0); GGML_ASSERT(ncols % QK4_NL == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>( mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
@@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>( mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
vx, vy, dst, ncols, nrows, item_ct1); vx, vy, dst, ncols, nrows, item_ct1);
}); });
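
The SYCL hunks above are a mechanical rename of QK_WARP_SIZE to WARP_SIZE; the invariant they preserve is that the innermost work-group dimension and the kernel's required sub-group size come from the same constant, since both describe the set of lanes that cooperate in one row reduction. A minimal sketch of that launch pattern, with the kernel body elided and WARP_SIZE / GGML_SYCL_MMV_Y given illustrative values rather than the backend's real definitions:

#include <sycl/sycl.hpp>

// illustrative values; the real constants live in the ggml SYCL backend
constexpr int WARP_SIZE       = 32;
constexpr int GGML_SYCL_MMV_Y = 1;

static void launch_mmvq_sketch(sycl::queue & q, int nrows) {
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    // the last dimension of the work-group shape ...
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    q.submit([&](sycl::handler & cgh) {
        cgh.parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item)
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                // ... must match the sub-group width requested here,
                // exactly as in the kernels above
                (void) item;
            });
    });
}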

View file

@@ -5,23 +5,24 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16]; shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16]; shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) { [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
barrier();
if (!all_threads) { // when we don't have enough blocks to use all threads if (!all_threads) { // when we don't have enough blocks to use all threads
if (i < num_blocks_per_row) { if (i < num_blocks_per_row) {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
} }
barrier(); barrier();
@@ -29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
continue; continue;
} else { } else {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
barrier(); barrier();
} }
@@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) { [[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ], sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l ],
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2], fma(FLOAT_TYPE(b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2],
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ], fma(FLOAT_TYPE(b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l ],
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2], fma(FLOAT_TYPE(b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2],
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ], fma(FLOAT_TYPE(b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l ],
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2], fma(FLOAT_TYPE(b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2],
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ], fma(FLOAT_TYPE(b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l ],
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1)))))))); fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im], sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[csel][ix][ 8*v_im],
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im], fma(FLOAT_TYPE(b16[l]), sccache2[csel][ix][1 + 8*v_im],
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im], fma(FLOAT_TYPE(b32[l]), sccache2[csel][ix][2 + 8*v_im],
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im], fma(FLOAT_TYPE(b48[l]), sccache2[csel][ix][3 + 8*v_im],
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im], fma(FLOAT_TYPE(b64[l]), sccache2[csel][ix][4 + 8*v_im],
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im], fma(FLOAT_TYPE(b80[l]), sccache2[csel][ix][5 + 8*v_im],
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im], fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2)))))))); fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
} }
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n])); temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
} }
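
This and the next two shader diffs apply the same change: the per-superblock scale cache in shared memory is doubled ([2][...]) and indexed by a csel bit that flips once per outer iteration, so the writes for iteration i land in the copy that iteration i-1 was not reading. A rough C++ sketch of that ping-pong pattern (names and sizes are illustrative, not the shader's):

#include <array>

struct ScalePingPong {
    // two copies of the cache; [csel] selects the copy being filled this iteration
    std::array<std::array<float, 16>, 2> sccache{};
    unsigned csel = 0;

    template <typename Fill, typename Use>
    void step(Fill fill, Use use) {
        csel ^= 1;            // writes now target the copy not read last iteration
        fill(sccache[csel]);  // producer threads store the fresh scales
        // the point of the second copy: iteration i's writes cannot race with
        // iteration i-1's outstanding reads, which relaxes where barriers must
        // be placed (in the shaders, a barrier() after the writes publishes them)
        use(sccache[csel]);   // consumers read only the freshly filled copy
    }
};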

View file

@@ -5,20 +5,21 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8]; shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) { [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
if (!all_threads) { // when we don't have enough blocks to use all threads if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row) if (i < num_blocks_per_row)
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier(); barrier();
if (i >= num_blocks_per_row) if (i >= num_blocks_per_row)
@@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303)); const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
if (all_threads) { if (all_threads) {
barrier(); sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier(); barrier();
} }
@@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
FLOAT_TYPE sum = FLOAT_TYPE(0.0); FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) { [[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ], sum = fma(FLOAT_TYPE( b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2], fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ], fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2], fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ], fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2], fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ], fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum)))))))); fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
} }
temp[j][n] = fma(d, sum, temp[j][n]); temp[j][n] = fma(d, sum, temp[j][n]);
} }

View file

@@ -6,20 +6,21 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
uint csel = 0;
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) { [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
csel ^= 1;
if (!all_threads) { // when we don't have enough blocks to use all threads if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row) if (i < num_blocks_per_row)
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier(); barrier();
if (i >= num_blocks_per_row) if (i >= num_blocks_per_row)
@@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const vec4 q3 = vec4(unpack8(q3_u32)) - 32; const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
if (all_threads) { if (all_threads) {
barrier(); sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier(); barrier();
} }
@@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]); sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]); sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
} }
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]); temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]);
} }
} }
} }

View file

@@ -743,7 +743,7 @@ void main() {
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
[[unroll]] for (uint col = 0; col < BN; col += storestride) { [[unroll]] for (uint col = 0; col < TN; col += storestride) {
const uint row_i = dc + cm_col * TN + col + store_c; const uint row_i = dc + cm_col * TN + col + store_c;
if (row_i >= _ne1) break; if (row_i >= _ne1) break;

View file

@@ -253,6 +253,7 @@ class MODEL_ARCH(IntEnum):
MINICPM3 = auto() MINICPM3 = auto()
GEMMA = auto() GEMMA = auto()
GEMMA2 = auto() GEMMA2 = auto()
GEMMA3 = auto()
STARCODER2 = auto() STARCODER2 = auto()
RWKV6 = auto() RWKV6 = auto()
RWKV6QWEN2 = auto() RWKV6QWEN2 = auto()
@@ -440,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.GEMMA3: "gemma3",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
@@ -1077,6 +1079,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_POST_NORM, MODEL_TENSOR.FFN_POST_NORM,
], ],
MODEL_ARCH.GEMMA3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.STARCODER2: [ MODEL_ARCH.STARCODER2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,

View file

@@ -2876,11 +2876,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
bool llava_images_changed = false; bool llava_images_changed = false;
bool add_bos_token = true; bool add_bos_token = true; //if set to false, mmproj handling breaks
if(file_format == FileFormat::GGUF_GENERIC) if(file_format == FileFormat::GGUF_GENERIC && mmproj_filename == "")
{ {
const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model)); const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
add_bos_token = llama_vocab_get_add_bos(tmpvocab); add_bos_token = llama_vocab_get_add_bos(tmpvocab);
if(!add_bos_token)
{
printf("\nBOS token prefix was disabled for this model.");
}
} }
for(int x=0;x<inputs.stop_sequence_len;++x) for(int x=0;x<inputs.stop_sequence_len;++x)

View file

@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_XVERSE, "xverse" },
@@ -766,6 +767,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
}, },
}, },
{
LLM_ARCH_GEMMA3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{ {
LLM_ARCH_STARCODER2, LLM_ARCH_STARCODER2,
{ {

View file

@@ -40,6 +40,7 @@ enum llm_arch {
LLM_ARCH_MINICPM3, LLM_ARCH_MINICPM3,
LLM_ARCH_GEMMA, LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA2,
LLM_ARCH_GEMMA3,
LLM_ARCH_STARCODER2, LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA, LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE, LLM_ARCH_XVERSE,

View file

@@ -9,6 +9,7 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cstring> #include <cstring>
#include <cmath>
#include <functional> #include <functional>
#include <map> #include <map>
#include <sstream> #include <sstream>
@@ -869,6 +870,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_GEMMA3:
{
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 26: type = LLM_TYPE_1B; break;
case 34: type = LLM_TYPE_4B; break;
case 48: type = LLM_TYPE_12B; break;
case 62: type = LLM_TYPE_27B; break;
default: type = LLM_TYPE_UNKNOWN;
}
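// attention scale (restating the assignment below): the 27B variant uses
// 1/sqrt(n_embd / n_head), i.e. the residual width per head, as its Q*K^T
// scale, while the smaller variants use the usual 1/sqrt(n_embd_head_k)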
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
} break;
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2549,6 +2567,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_GEMMA3:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3746,6 +3793,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -4022,6 +4070,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHIMOE: case LLM_ARCH_PHIMOE:
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM: case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX: case LLM_ARCH_GPTNEOX:

View file

@@ -5014,6 +5014,149 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_gemma3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
// gemma3 requires different mask for layers using sliding window (SWA)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
// "5-to-1 interleaved attention"
// 5 layers of local attention followed by 1 layer of global attention
static const int sliding_window_pattern = 6;
for (int il = 0; il < n_layer; ++il) {
const bool is_sliding = (il + 1) % sliding_window_pattern;
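// with sliding_window_pattern == 6, (il + 1) % 6 is non-zero for five out of
// every six layers, so those layers take the SWA mask and the local freq_base
// of 10000.0f below; every sixth layer ((il + 1) % 6 == 0) attends globally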
const float freq_base_l = is_sliding ? 10000.0f : freq_base;
const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
Qcur = llm_build_norm(ctx0, Qcur, hparams,
model.layers[il].attn_q_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
Kcur = llm_build_norm(ctx0, Kcur, hparams,
model.layers[il].attn_k_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur_normed", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
}
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_post_norm", il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);
cur = llm_build_norm(ctx0, sa_out, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_post_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "ffn_post_norm", -1);
cur = ggml_add(ctx0, cur, sa_out);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_starcoder2() { struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -8334,6 +8477,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_gemma2(); result = llm.build_gemma2();
} break; } break;
case LLM_ARCH_GEMMA3:
{
result = llm.build_gemma3();
} break;
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
{ {
result = llm.build_starcoder2(); result = llm.build_starcoder2();