diff --git a/Makefile b/Makefile index c979f7553..225865a01 100644 --- a/Makefile +++ b/Makefile @@ -639,6 +639,8 @@ ttsmain: examples/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llam $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +gemma3-cli: examples/llava/gemma3-cli.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @echo 'This command can be MANUALLY run to regenerate vulkan shaders. Normally concedo will do it, so you do not have to.' diff --git a/common/arg.cpp b/common/arg.cpp index 5d1393c79..c11bd34b6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1868,16 +1868,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", - string_format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? params.cvector_outfile.c_str() - : params.out_file.c_str()), + string_format("output file (default: '%s')", params.out_file.c_str()), [](common_params & params, const std::string & value) { params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( diff --git a/common/chat.cpp b/common/chat.cpp index 1b10219cc..62ca26ad7 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -60,7 +60,9 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa } msg.role = message.at("role"); - if (message.contains("content")) { + auto has_content = message.contains("content"); + auto has_tool_calls = message.contains("tool_calls"); + if (has_content) { const auto & content = message.at("content"); if (content.is_string()) { msg.content = content; @@ -81,19 +83,8 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa } else if (!content.is_null()) { throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } - } else { - throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } - if (message.contains("reasoning_content")) { - msg.reasoning_content = message.at("reasoning_content"); - } - if (message.contains("name")) { - msg.tool_name = message.at("name"); - } - if (message.contains("tool_call_id")) { - msg.tool_call_id = message.at("tool_call_id"); - } - if (message.contains("tool_calls")) { + if (has_tool_calls) { for (const auto & tool_call : message.at("tool_calls")) { common_chat_tool_call tc; if (!tool_call.contains("type")) { @@ -118,6 +109,18 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa msg.tool_calls.push_back(tc); } } + if (!has_content && !has_tool_calls) { + throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & 
https://github.com/ggml-org/llama.cpp/issues/12279)"); + } + if (message.contains("reasoning_content")) { + msg.reasoning_content = message.at("reasoning_content"); + } + if (message.contains("name")) { + msg.tool_name = message.at("name"); + } + if (message.contains("tool_call_id")) { + msg.tool_call_id = message.at("tool_call_id"); + } msgs.push_back(msg); } @@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; + case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; default: @@ -875,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ return data; } static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) { - static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); - static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); - static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); + static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); + static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); + static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); std::smatch match; @@ -1009,10 +1013,10 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com } static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) { // TODO: tighten & simplify the parser, don't accept leading text context. - static std::regex function_regex( + static const std::regex function_regex( "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); - static std::regex close_regex("\\}\\s*"); - static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); + static const std::regex close_regex("\\}\\s*"); + static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); if (with_builtin_tools) { std::smatch match; @@ -1102,34 +1106,42 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ data.format = inputs.extract_reasoning ? 
COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1; return data; } -static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { - static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); - static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); - static std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); - static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); - common_chat_msg msg; - msg.role = "assistant"; +static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function & rest_parser) { std::smatch match; + static const std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); if (std::regex_match(input, match, reasoning_content_regex)) { - std::string rest; + auto rest = match[3].str(); + auto msg = rest_parser(rest); + auto reasoning_content = string_strip(match[2].str()); if (extract_reasoning) { - msg.reasoning_content = string_strip(match[2].str()); - } else { - msg.content = match[1].str(); + msg.reasoning_content = reasoning_content; + } else if (!reasoning_content.empty()) { + std::ostringstream content; + content << "" << reasoning_content << "" << msg.content; + msg.content = content.str(); } - rest = match[3].str(); + return msg; + } + return rest_parser(input); +} +static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { + return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { + static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); + static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); + static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); - if (std::regex_search(rest, match, tool_calls_regex)) { + common_chat_msg msg; + msg.role = "assistant"; + std::smatch match; + if (std::regex_search(input, match, tool_calls_regex)) { auto tool_calls = match[1].str(); auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex); msg.tool_calls = std::move(msg2.tool_calls); } else { - msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end()); + msg.content = input; } - } else { - msg.content = input; - } - return msg; + return msg; + }); } static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1234,8 +1246,8 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ } static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) { - static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); - static std::regex close_regex(R"($|(?=>>>))"); + static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); + static const std::regex close_regex(R"($|(?=>>>))"); std::string content; auto it = input.begin(); @@ -1324,7 +1336,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con } static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & 
input) { // This version of Functionary still supports the llama 3.1 tool call format for the python tool. - static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)"); + static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)"); std::smatch match; if (std::regex_search(input, match, python_tag_regex)) { auto code = match[1].str(); @@ -1338,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s }); return msg; } - static std::regex function_regex(R"()"); - static std::regex close_regex(R"()"); + static const std::regex function_regex(R"()"); + static const std::regex close_regex(R"()"); // TODO: tighten & simplify. return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } @@ -1406,6 +1418,8 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat "(?:```(?:json|xml)?\n\\s*)?(?:|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", }); data.preserved_tokens = { + "", + "", "", "", "" // match 2 (open_tag) - "|" - "|" - "|" - "|" - "|" - "|" - "|" - ")?" - "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) - ")" - "|" - "(?:]+)>" // match 4 (function name) - "|)" // match 5 (function name again) - "([\\s\\S]*)" // match 6 (function arguments + rest)})" - ); +static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) { + return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { + static const std::regex open_regex( + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "|" + "|" + "|" + "|" + "|" + "|" + "|" + ")?" + "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) + ")" + "|" + "(?:]+)>" // match 4 (function name) + "|)" // match 5 (function name again) + "([\\s\\S]*)" // match 6 (function arguments + rest)})" + ); - try { + try { + common_chat_msg msg; + msg.role = "assistant"; - common_chat_msg msg; - msg.role = "assistant"; + std::string::const_iterator it = input.begin(); + const std::string::const_iterator end = input.end(); + std::smatch match; - std::string::const_iterator it = input.begin(); - const std::string::const_iterator end = input.end(); - std::smatch match; + while (it != end) { + if (std::regex_search(it, end, match, open_regex)) { + // Add content before the match + msg.content += std::string(it, match[0].first); - while (it != end) { - if (std::regex_search(it, end, match, open_regex)) { - // Add content before the match - msg.content += std::string(it, match[0].first); + auto block_start = match[1].str(); + std::string block_end = block_start.empty() ? "" : "```"; - auto block_start = match[1].str(); - std::string block_end = block_start.empty() ? "" : "```"; + auto open_tag = match[2].str(); + std::string close_tag; - auto open_tag = match[2].str(); - std::string close_tag; + if (match[3].matched) { + close_tag = open_tag.empty() ? 
"" : ""; + // Start parsing from after the opening tags + auto json_it = match[6].first; + json arguments; + if (parse_json(json_it, end, arguments)) { + msg.tool_calls.emplace_back(process_tool_call({ + {"name", function_name}, + {"arguments", arguments}, + })); + it = json_it; // Move iterator past parsed JSON + + // Handle close tags + consume_spaces(it, end); + if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { + throw std::runtime_error("Failed to parse closing tag"); + } + consume_spaces(it, end); + if (!block_end.empty() && !parse_literal(it, end, block_end)) { + throw std::runtime_error("Failed to parse block end"); + } + consume_spaces(it, end); + } else { + // Not a valid tool call, treat as content + msg.content += std::string(match[0].first, match[0].second); + it = match[0].second; + } } } else { - auto function_name = match[4].str(); - if (function_name.empty()) { - function_name = match[5].str(); - } - GGML_ASSERT(!function_name.empty()); - - close_tag = ""; - // Start parsing from after the opening tags - auto json_it = match[6].first; - json arguments; - if (parse_json(json_it, end, arguments)) { - msg.tool_calls.emplace_back(process_tool_call({ - {"name", function_name}, - {"arguments", arguments}, - })); - it = json_it; // Move iterator past parsed JSON - - // Handle close tags - consume_spaces(it, end); - if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { - throw std::runtime_error("Failed to parse closing tag"); - } - consume_spaces(it, end); - if (!block_end.empty() && !parse_literal(it, end, block_end)) { - throw std::runtime_error("Failed to parse block end"); - } - consume_spaces(it, end); - } else { - // Not a valid tool call, treat as content - msg.content += std::string(match[0].first, match[0].second); - it = match[0].second; - } + // Add remaining content + msg.content += std::string(it, end); + break; } - } else { - // Add remaining content - msg.content += std::string(it, end); - break; } + return msg; + } catch (const std::exception & e) { + LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); + common_chat_msg msg; + msg.role = "assistant"; + msg.content = input; + return msg; } - return msg; - } catch (const std::exception & e) { - LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); - common_chat_msg msg; - msg.role = "assistant"; - msg.content = input; - return msg; - } + }); } static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1606,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_command_r7b(tmpl, params); } + // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) + if (src.find("") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_hermes_2_pro(tmpl, params); + } + // Use generic handler when mixing tools + JSON schema. // TODO: support that mix in handlers below. 
if ((params.tools.is_array() && params.json_schema.is_object())) { @@ -1627,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_without_tools(tmpl, params); } - // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos) { - return common_chat_params_init_hermes_2_pro(tmpl, params); - } - // Functionary v3.1 (w/ tools) if (src.find("<|start_header_id|>") != std::string::npos && src.find("= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token.encode("utf-8"): logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') @@ -3322,6 +3325,83 @@ class Gemma2Model(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA3 + has_vision: bool = False + + # we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + if "vision_config" in hparams: + logger.info("Has vision encoder, but it will be ignored") + self.has_vision = True + + def write(self): + super().write() + if self.has_vision: + logger.info("NOTE: this script only convert the language model to GGUF") + logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + # some default values are not specified in the hparams + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers + # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + assert hparams.get("attn_logit_softcapping") is None + assert hparams.get("final_logit_softcapping") is None + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + if hparams.get("rope_scaling") is not None: + assert hparams["rope_scaling"]["rope_type"] == "linear" + # important: this rope_scaling is only applied for global layers, and not used by 1B model + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("language_model."): + name = 
name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later + # ignore vision tensors + return [] + + # remove OOV (out-of-vocabulary) rows in token_embd + if "embed_tokens.weight" in name: + vocab = self._create_vocab_sentencepiece() + tokens = vocab[0] + data_torch = data_torch[:len(tokens)] + + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/examples/llava/README-gemma3.md b/examples/llava/README-gemma3.md new file mode 100644 index 000000000..20bf73fb5 --- /dev/null +++ b/examples/llava/README-gemma3.md @@ -0,0 +1,30 @@ +# Gemma 3 vision + +> [!IMPORTANT] +> +> This is very experimental, only used for demo purpose. + +## How to get mmproj.gguf? + +```bash +cd gemma-3-4b-it +python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py . + +# output file is mmproj.gguf +``` + +## How to run it? + +What you need: +- The text model GGUF, can be converted using `convert_hf_to_gguf.py` +- The mmproj file from step above +- An image file + +```bash +# build +cmake -B build +cmake --build build --target llama-gemma3-cli + +# run it +./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg +``` diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md index 8f591506d..48c423238 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/examples/llava/README-minicpmo2.6.md @@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder. 
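(Aside, not part of the patch: a minimal PyTorch sketch of the two tensor adjustments the `Gemma3Model.modify_tensors` converter added in `convert_hf_to_gguf.py` above performs, namely trimming out-of-vocabulary embedding rows and folding the Gemma3RMSNorm `output * (1.0 + weight)` offset into the stored norm weights. The sizes used below are hypothetical placeholders, not values read from any checkpoint.)

```python
import torch

# hypothetical sizes, for illustration only
vocab_size, hidden_size = 262_144, 2560

# 1) the HF token embedding may carry extra out-of-vocabulary rows; the
#    converter slices it down to the SentencePiece vocabulary length
embed_tokens = torch.randn(vocab_size + 64, hidden_size)
embed_tokens = embed_tokens[:vocab_size]

# 2) Gemma3RMSNorm computes `output * (1.0 + weight)`, while llama.cpp's RMS
#    norm multiplies by the stored weight directly, so the converter adds 1
#    to every `*norm.weight` tensor before writing it to GGUF
norm_weight = torch.zeros(hidden_size)   # HF checkpoint stores weights near 0
norm_weight = norm_weight + 1            # stored GGUF weights end up near 1

print(tuple(embed_tokens.shape), float(norm_weight.mean()))
```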
+ +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash -git clone git@github.com:OpenBMB/llama.cpp.git +git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -git checkout minicpm-omni ``` +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + ### Usage of MiniCPM-o 2.6 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us) @@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model # quantize int4 version -./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build llama.cpp using `CMake`: -https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md - -```bash -cmake -B build -cmake --build build --config Release -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" # run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i +./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ``` diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md index b0e72a0fa..6bfe7abd1 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/examples/llava/README-minicpmv2.5.md @@ -4,13 +4,26 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder. 
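(Aside, not part of the patch: a small Python wrapper that chains the MiniCPM-o 2.6 quantize and inference commands shown in the README above via `subprocess`. The `build/` layout, the `../MiniCPM-o-2_6` directory, the sampling parameters, and the `xx.jpg` image name are the README's own examples, assumed here rather than required.)

```python
import subprocess

MODEL_DIR = "../MiniCPM-o-2_6"  # assumed layout from the README above

def run(cmd):
    # echo the command, then fail loudly if it returns non-zero
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)

# quantize the F16 language model to Q4_K_M (same command as the README)
run(["./build/bin/llama-quantize",
     f"{MODEL_DIR}/model/ggml-model-f16.gguf",
     f"{MODEL_DIR}/model/ggml-model-Q4_K_M.gguf",
     "Q4_K_M"])

# run the quantized model on one image with the README's sampling settings
run(["./build/bin/llama-minicpmv-cli",
     "-m", f"{MODEL_DIR}/model/ggml-model-Q4_K_M.gguf",
     "--mmproj", f"{MODEL_DIR}/mmproj-model-f16.gguf",
     "-c", "4096", "--temp", "0.7", "--top-p", "0.8", "--top-k", "100",
     "--repeat-penalty", "1.05",
     "--image", "xx.jpg", "-p", "What is in the image?"])
```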
+ +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` -### Usage +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + +### Usage of MiniCPM-Llama3-V 2.5 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) @@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model # quantize int4 version -./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" # run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. 
Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ``` diff --git a/examples/llava/README-minicpmv2.6.md b/examples/llava/README-minicpmv2.6.md index c4be5e5dd..2df39cdba 100644 --- a/examples/llava/README-minicpmv2.6.md +++ b/examples/llava/README-minicpmv2.6.md @@ -4,13 +4,25 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder. + +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash -git clone git@github.com:OpenBMB/llama.cpp.git +git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -git checkout minicpmv-main ``` +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + ### Usage of MiniCPM-V 2.6 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us) @@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model # quantize int4 version -./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
# run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Video -Install FFmpeg -``` -brew install ffmpeg -brew install pkg-config -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ef5de2df1..d1e1eec1e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -4,6 +4,7 @@ // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "ggml.h" +#include "ggml-cpp.h" #include "ggml-cpu.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -12,19 +13,9 @@ #ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - #ifdef GGML_USE_VULKAN #include "ggml-vulkan.h" #endif @@ -158,6 +149,8 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 +#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" #define TN_MINICPMV_QUERY "resampler.query" @@ -184,6 +177,7 @@ enum projector_type { PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_UNKNOWN, }; @@ -194,6 +188,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, }; @@ -320,7 +315,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) return kv.first; } } - return PROJECTOR_TYPE_UNKNOWN; + throw std::runtime_error(format("Unknown projector type: %s", name.c_str())); } #ifdef CLIP_DEBUG_FUNCTIONS @@ -577,8 +572,18 @@ struct clip_vision_model { struct ggml_tensor * mm_model_ln_kv_b; struct ggml_tensor * mm_model_ln_post_w; struct ggml_tensor * mm_model_ln_post_b; + + // gemma3 + struct ggml_tensor * mm_input_proj_w; + struct ggml_tensor * mm_soft_emb_norm_w; }; +bool enable_gpu_clip = true; +void set_clip_uses_gpu(bool usegpu) +{ + enable_gpu_clip = usegpu; +} + struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; @@ -591,7 +596,7 @@ struct clip_ctx { struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; - int32_t max_feature_layer; + int32_t max_feature_layer; // unused in newer models like gemma3 float image_mean[3]; float image_std[3]; bool use_gelu = false; @@ -603,21 +608,213 @@ struct clip_ctx { bool has_post_norm = false; bool has_patch_bias = false; - struct gguf_context * ctx_gguf; - struct ggml_context * ctx_data; + struct gguf_context * ctx_gguf = nullptr; + struct ggml_context * ctx_data = nullptr; std::vector buf_compute_meta; - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; + std::vector backend_ptrs; + std::vector backend_buft; - ggml_backend_t backend = NULL; - ggml_gallocr_t compute_alloc = NULL; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buf = nullptr; - struct clip_image_size * load_image_size; + ggml_backend_sched_ptr sched; + + struct clip_image_size * load_image_size = nullptr; + + clip_ctx(clip_context_params & ctx_params) { + + if(enable_gpu_clip) + { + #ifdef GGML_USE_CUDA + backend = ggml_backend_cuda_init(0); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); + #endif + #ifdef GGML_USE_METAL + 
backend = ggml_backend_metal_init(); + LOG_INF("%s: CLIP using Metal backend\n", __func__); + #endif + #ifdef GGML_USE_VULKAN + backend = ggml_backend_vk_init(0); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); + #endif + } + + if (!backend) { + backend = ggml_backend_cpu_init(); + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) + ); + } + + ~clip_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + ggml_backend_free(backend); + } }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + GGML_ASSERT(imgs->size == 1); // batch_size == 1 + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input raw + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + + // position embeddings + struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings); + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b); + } + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = 
ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); + KQ = ggml_soft_max_inplace(ctx0, KQ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + // siglip uses gelu + cur = ggml_gelu(ctx0, cur); + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (ctx->has_post_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + const int mm_tokens_per_image = 256; // default value for gemma3 + const int tokens_per_side = sqrt(mm_tokens_per_image); + const int patches_per_image = sqrt(num_patches); + const int kernel_size = patches_per_image / tokens_per_side; + + embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); + embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size); + + // doing a pool2d to reduce the number of output tokens to 256 + embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size); + embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); + + // apply norm before projection + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w); + + // apply projection + embeddings = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + embeddings); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + ggml_free(ctx0); + + return gf; +} + +static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -1163,7 +1360,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } else { GGML_ABORT("fatel error"); } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + } + else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); @@ -1185,14 +1383,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 return gf; } -bool enable_gpu_clip = true; -void set_clip_uses_gpu(bool usegpu) -{ - enable_gpu_clip = usegpu; +static 
ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + return clip_image_build_graph_siglip(ctx, imgs); + } else { + // TODO: we should have one build_* function per model + return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } } // read and create ggml_context containing the tensors and their data struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + return clip_init(fname, clip_context_params{ + /* use_gpu */ true, + /* verbosity */ verbosity, + }); +} + +struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { + int verbosity = ctx_params.verbosity; struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -1206,6 +1415,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 1) { + try { const int n_tensors = gguf_get_n_tensors(ctx); const int n_kv = gguf_get_n_kv(ctx); const int ftype = get_u32(ctx, KEY_FTYPE); @@ -1224,6 +1434,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("%s: n_kv: %d\n", __func__, n_kv); LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); LOG_INF("\n"); + } catch (std::runtime_error & /*e*/) { + LOG_INF("Could not list CLIP model properties.\n"); + } } const int n_tensors = gguf_get_n_tensors(ctx); @@ -1286,7 +1499,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } - clip_ctx * new_clip = new clip_ctx{}; + clip_ctx * new_clip = new clip_ctx(ctx_params); // update projector type { @@ -1305,39 +1518,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -if(enable_gpu_clip) -{ -#ifdef GGML_USE_CUDA - new_clip->backend = ggml_backend_cuda_init(0); - LOG_INF("%s: CLIP using CUDA backend\n", __func__); -#endif - -#ifdef GGML_USE_METAL - new_clip->backend = ggml_backend_metal_init(); - LOG_INF("%s: CLIP using Metal backend\n", __func__); -#endif - -#ifdef GGML_USE_CANN - new_clip->backend = ggml_backend_cann_init(0); - LOG_INF("%s: CLIP using CANN backend\n", __func__); -#endif - -#ifdef GGML_USE_VULKAN - new_clip->backend = ggml_backend_vk_init(0); - LOG_INF("%s: CLIP using Vulkan backend\n", __func__); -#endif - -#ifdef GGML_USE_SYCL - new_clip->backend = ggml_backend_sycl_init(0); - LOG_INF("%s: CLIP using SYCL backend\n", __func__); -#endif -} - - if (!new_clip->backend) { - new_clip->backend = ggml_backend_cpu_init(); - LOG_INF("%s: CLIP using CPU backend\n", __func__); - } - // model size and capabilities { int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); @@ -1375,8 +1555,12 @@ if(enable_gpu_clip) GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); - idx = get_key_idx(ctx, KEY_USE_GELU); - new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + try { + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + } catch (std::runtime_error & /*e*/) { + new_clip->use_gelu = false; + } try { idx = get_key_idx(ctx, KEY_USE_SILU); @@ -1390,6 +1574,7 @@ if(enable_gpu_clip) LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version); 
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); @@ -1432,7 +1617,9 @@ if(enable_gpu_clip) } // alloc memory and offload data - new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend); + new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft); + ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); @@ -1445,7 +1632,7 @@ if(enable_gpu_clip) return nullptr; } int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { + if (ggml_backend_buft_is_host(buft)) { // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(cur->data), num_bytes); } else { @@ -1581,11 +1768,17 @@ if(enable_gpu_clip) } try { - vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + } catch(const std::exception& /*e*/) { + vision_model.patch_embeddings_0 = nullptr; + } + + try { vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_ERR("%s: failed to load vision model tensors\n", __func__); + vision_model.position_embeddings = nullptr; } + try { vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); } catch(const std::exception& /*e*/) { @@ -1696,6 +1889,10 @@ if(enable_gpu_clip) vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) { + vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ); + vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1731,14 +1928,21 @@ if(enable_gpu_clip) // measure mem requirement and allocate { new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; batch.data = nullptr; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); - ggml_gallocr_reserve(new_clip->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + ggml_backend_sched_reserve(new_clip->sched.get(), gf); + for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) { + ggml_backend_t backend = new_clip->backend_ptrs[i]; + ggml_backend_buffer_type_t buft = new_clip->backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend); + if (size > 1) { + 
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } } return new_clip; @@ -2317,7 +2521,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli return true; } - if (ctx->has_glm_projector) { + if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { res_imgs->size = 1; res_imgs->data = new clip_image_f32[res_imgs->size]; clip_image_u8 resized_image; @@ -2506,12 +2710,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { } void clip_free(clip_ctx * ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); delete ctx; } @@ -2707,8 +2905,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // build the inference graph + ggml_backend_sched_reset(ctx->sched.get()); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs const auto & model = ctx->vision_model; @@ -2847,6 +3046,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); } + else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + // do nothing + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -2877,7 +3079,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); } - ggml_backend_graph_compute(ctx->backend, gf); + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } // the last node is the embedding tensor struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); @@ -3067,6 +3273,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { return ctx->vision_model.mm_1_b->ne[0]; } + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + return ctx->vision_model.mm_input_proj_w->ne[0]; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 76ca99716..b9b8c275a 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -39,8 +39,15 @@ struct clip_image_f32_batch { size_t size; }; -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); -CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); +struct clip_context_params { + bool use_gpu; + int verbosity; +}; + +// deprecated, use clip_init +CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); + +CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params); CLIP_API void clip_free(struct clip_ctx * ctx); diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp new file mode 100644 index 000000000..a07864d4e --- /dev/null +++ b/examples/llava/gemma3-cli.cpp @@ -0,0 +1,341 @@ +#include "arg.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include 
"clip.h" +#include "stb_image.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" + +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +static bool g_is_generating = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying Gemma 3 vision capabilities. + * For contributors: please keep this code simple and easy to understand. + */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for using Gemma 3 vision model\n\n" + "Usage: %s [options] -m --mmproj --image -p \n\n" + " -m and --mmproj are required\n" + " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", + argv[0] + ); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (g_is_generating) { + g_is_generating = false; + } else { + console::cleanup(); + LOG("\nInterrupted by user\n"); + _exit(130); + } + } +} +#endif + +struct gemma3_context { + struct clip_ctx * ctx_clip = NULL; + common_init_result llama_init; + + llama_model * model; + llama_context * lctx; + const llama_vocab * vocab; + llama_batch batch; + + int n_threads = 1; + llama_pos n_past = 0; + + gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) { + model = llama_init.model.get(); + lctx = llama_init.context.get(); + vocab = llama_model_get_vocab(model); + n_threads = params.cpuparams.n_threads; + batch = llama_batch_init(params.n_batch, 0, 1); + init_clip_model(params); + } + + void init_clip_model(common_params & params) { + const char * clip_path = params.mmproj.c_str(); + ctx_clip = clip_model_load(clip_path, params.verbosity > 1); + } + + ~gemma3_context() { + clip_free(ctx_clip); + } +}; + +struct decode_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) { + llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true); + common_batch_clear(ctx.batch); + for (llama_token & t : tokens) { + common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false); + } + if (logits_last) { + ctx.batch.logits[ctx.batch.n_tokens - 1] = true; + } + // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str()); + if (llama_decode(ctx.lctx, ctx.batch)) { + LOG_ERR("Failed to decode text\n"); + return 1; + } + return 0; +} + +static int eval_image(gemma3_context & ctx, std::string & fname) { + std::vector image_embd_v; + int n_embd = llama_model_n_embd(ctx.model); + int 
n_tokens = 256; + image_embd_v.resize(n_tokens * n_embd); + + bool ok; + struct clip_image_u8 * img_u8 = clip_image_u8_init(); + ok = clip_image_load_from_file(fname.c_str(), img_u8); + if (!ok) { + LOG_ERR("Unable to load image %s\n", fname.c_str()); + clip_image_u8_free(img_u8); + return 2; // non-fatal error + } + + clip_image_f32_batch batch_f32; + ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32); + if (!ok) { + LOG_ERR("Unable to preprocess image\n"); + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + return 1; + } + + int64_t t0 = ggml_time_ms(); + LOG("Encoding image %s\n", fname.c_str()); + ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data()); + if (!ok) { + LOG_ERR("Unable to encode image\n"); + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + return 1; + } + LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + + // decode image embeddings + int64_t t1 = ggml_time_ms(); + eval_text(ctx, ""); + llama_set_causal_attn(ctx.lctx, false); + decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0); + if (llama_decode(ctx.lctx, batch_img.batch)) { + LOG_ERR("failed to decode image\n"); + return 1; + } + ctx.n_past += n_tokens; + llama_set_causal_attn(ctx.lctx, true); + eval_text(ctx, ""); + LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); + return 0; +} + +static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) { + for (int i = 0; i < n_predict; i++) { + if (i > n_predict || !g_is_generating) { + printf("\n"); + break; + } + + llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1); + common_sampler_accept(smpl, token_id, true); + + if (llama_vocab_is_eog(ctx.vocab, token_id)) { + printf("\n"); + break; // end of generation + } + + printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str()); + fflush(stdout); + + // eval the token + common_batch_clear(ctx.batch); + common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true); + if (llama_decode(ctx.lctx, ctx.batch)) { + LOG_ERR("failed to decode token\n"); + return 1; + } + } + return 0; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + params.sampling.temp = 0.2; // lower temp by default for better quality + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { + return 1; + } + + common_init(); + + if (params.mmproj.empty()) { + show_additional_info(argc, argv); + return 1; + } + + gemma3_context ctx(params); + printf("%s: %s\n", __func__, params.model.c_str()); + + bool is_single_turn = !params.prompt.empty() && !params.image.empty(); + + struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling); + int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict; + + // ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? 
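
For readers skimming the hunk above: the heart of eval_image() is that the 256 vision embeddings are decoded as a single embeddings-only batch with causal attention switched off, then causal attention is restored for the text turn. A condensed sketch of that call sequence follows (illustrative only, not a drop-in replacement; it assumes llama.h and the decode_embd_batch helper defined earlier in this file are in scope, and trims the logging and error paths of the original):

    // Condensed sketch of the image-embedding decode in eval_image():
    // feed the vision embeddings as one embeddings-only batch with causal
    // attention disabled, then restore causal attention for text decoding.
    static bool decode_image_embeddings(llama_context * lctx, float * embd,
                                        int n_tokens, llama_pos & n_past) {
        llama_set_causal_attn(lctx, false);                 // image tokens attend bidirectionally
        decode_embd_batch batch_img(embd, n_tokens, n_past, /*seq_id=*/0);
        const bool ok = llama_decode(lctx, batch_img.batch) == 0;
        if (ok) {
            n_past += n_tokens;                             // advance past the image positions
        }
        llama_set_causal_attn(lctx, true);                  // back to causal attention for text
        return ok;
    }
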
(sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (eval_text(ctx, "")) { + return 1; + } + + if (is_single_turn) { + g_is_generating = true; + if (eval_text(ctx, "user\n")) { + return 1; + } + for (auto & fname : params.image) { + if (eval_image(ctx, fname)) { + return 1; + } + } + if (eval_text(ctx, params.prompt + "model\n", true)) { + return 1; + } + if (generate_response(ctx, smpl, n_predict)) { + return 1; + } + + } else { + LOG("\n Running in chat mode, available commands:"); + LOG("\n /image load an image"); + LOG("\n /clear clear the chat history"); + LOG("\n /quit or /exit exit the program"); + LOG("\n"); + + if (eval_text(ctx, "user\n")) { + return 1; + } + + while (true) { + g_is_generating = false; + LOG("\n> "); + console::set_display(console::user_input); + std::string line; + console::readline(line, false); + console::set_display(console::reset); + line = string_strip(line); + if (line.empty()) { + continue; + } + if (line == "/quit" || line == "/exit") { + break; + } + if (line == "/clear") { + ctx.n_past = 0; + llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS + LOG("Chat history cleared\n\n"); + continue; + } + g_is_generating = true; + if (line.find("/image") == 0) { + std::string image = line.substr(7); + int res = eval_image(ctx, image); + if (res == 2) { + continue; // image not found + } + if (res) { + return 1; + } + continue; + } + if (eval_text(ctx, line + "model\n", true)) { + return 1; + } + if (generate_response(ctx, smpl, n_predict)) { + return 1; + } + if (eval_text(ctx, "user\n")) { + return 1; + } + } + } + + return 0; +} diff --git a/examples/llava/gemma3_convert_encoder_to_gguf.py b/examples/llava/gemma3_convert_encoder_to_gguf.py new file mode 100644 index 000000000..241b526b9 --- /dev/null +++ b/examples/llava/gemma3_convert_encoder_to_gguf.py @@ -0,0 +1,307 @@ +import gguf +import argparse +import logging +import sys +import torch +import json +import os +import numpy as np +from typing import cast, ContextManager, Any, Iterator +from pathlib import Path +from torch import Tensor + +logger = logging.getLogger("gemma3-mmproj") + + +# (copied from convert_hf_to_gguf.py) +# tree of lazy tensors +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + args=(self,), + func=(lambda s: s.numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, 
shape: tuple[int, ...]) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + return cast(torch.Tensor, lazy) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + + return cls._wrap_fn(func)(*args, **kwargs) + + +class Gemma3VisionTower: + hparams: dict + gguf_writer: gguf.GGUFWriter + fname_out: Path + ftype: gguf.LlamaFileType + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + part_names.sort() + return part_names + + def __init__(self, + dir_model: Path, + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + hparams = Gemma3VisionTower.load_hparams(dir_model) + self.hparams = hparams + self.fname_out = fname_out + self.ftype = ftype + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess) + + text_config = hparams["text_config"] + vision_config = hparams["vision_config"] + + assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration" + assert text_config is not None + assert vision_config is not None + + self.gguf_writer.add_string ("clip.projector_type", "gemma3") + self.gguf_writer.add_bool ("clip.has_text_encoder", False) + self.gguf_writer.add_bool ("clip.has_vision_encoder", True) + self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy + self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"]) + self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"]) + self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"]) + self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"]) + self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"]) + self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"]) + self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"]) + self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6)) + # default values taken from HF tranformers code + self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5]) + self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5]) + self.gguf_writer.add_bool ("clip.use_gelu", True) + + # load tensors + for name, data_torch in self.get_tensors(dir_model): + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch) + + def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]: + part_names = 
Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors") + tensor_names_from_parts: set[str] = set() + for part_name in part_names: + logger.info(f"gguf: loading model part '{part_name}'") + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu")) + with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + + for name in model_part.keys(): + data = model_part.get_slice(name) + data = LazyTorchTensor.from_safetensors_slice(data) + yield name, data + + def add_tensor(self, name: str, data_torch: Tensor): + is_1d = len(data_torch.shape) == 1 + is_embd = ".embeddings." in name + old_dtype = data_torch.dtype + can_quantize = not is_1d and not is_embd + data_qtype = gguf.GGMLQuantizationType.F32 + + # this is to support old checkpoint + # TODO: remove this when we have the final model + name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.") + name = name.replace("multimodal_projector.", "multi_modal_projector.") + + # filter only vision tensors + if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."): + return + # prefix + name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.") + name = name.replace("vision_tower.vision_model.", "v.") + # projector and input embd + name = name.replace(".embeddings.patch_embedding.", ".patch_embd.") + name = name.replace(".embeddings.position_embedding.", ".position_embd.") + name = name.replace( + "multi_modal_projector.mm_input_projection_weight", + "mm.input_projection.weight" + ) + name = name.replace( + "multi_modal_projector.mm_soft_emb_norm.weight", + "mm.soft_emb_norm.weight" + ) + name = name.replace("post_layernorm.", "post_ln.") + # each block + name = name.replace(".self_attn.k_proj.", ".attn_k.") + name = name.replace(".self_attn.v_proj.", ".attn_v.") + name = name.replace(".self_attn.q_proj.", ".attn_q.") + name = name.replace(".self_attn.out_proj.", ".attn_out.") + name = name.replace(".layer_norm1.", ".ln1.") + name = name.replace(".layer_norm2.", ".ln2.") + name = name.replace(".mlp.fc1.", ".ffn_down.") + name = name.replace(".mlp.fc2.", ".ffn_up.") + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + + # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape 
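
The data_torch + 1 correction above exists because Gemma-style RMSNorm stores its scale as an offset from one (the layer computes x_norm * (1 + w)), whereas the GGUF/ggml side multiplies by the stored weight directly, so the converter exports w + 1 for the projector's soft_emb_norm. A minimal standalone C++ sketch of the two conventions (illustrative only, values made up; not code from this patch):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // RMSNorm with the Gemma convention (scale stored as an offset from 1)
    // versus the convention used by the exported weights (scale applied
    // directly). Writing w_gguf = w_gemma + 1 makes the two outputs match.
    static std::vector<float> rms_norm(const std::vector<float> & x,
                                       const std::vector<float> & w,
                                       bool gemma_offset, float eps = 1e-6f) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);
        std::vector<float> out(x.size());
        for (size_t i = 0; i < x.size(); i++) {
            const float scale = gemma_offset ? 1.0f + w[i] : w[i];
            out[i] = x[i] * inv_rms * scale;
        }
        return out;
    }

    int main() {
        const std::vector<float> x       = {0.5f, -1.0f, 2.0f};
        const std::vector<float> w_gemma = {0.1f, -0.2f, 0.0f};  // as stored in the HF checkpoint
        std::vector<float> w_gguf = w_gemma;
        for (float & v : w_gguf) v += 1.0f;                      // what the converter writes

        const auto a = rms_norm(x, w_gemma, /*gemma_offset=*/true);
        const auto b = rms_norm(x, w_gguf,  /*gemma_offset=*/false);
        for (size_t i = 0; i < x.size(); i++) {
            std::printf("%zu: gemma=%f  gguf=%f\n", i, a[i], b[i]);  // identical values
        }
        return 0;
    }
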
= {shape_str}") + + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert Gemma 3 vision tower safetensors to GGUF format",) + parser.add_argument( + "--outfile", type=Path, default="mmproj.gguf", + help="path to write to", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file", + nargs="?", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + + if not dir_model.is_dir(): + logger.error(f'Error: {args.model} is not a directory') + sys.exit(1) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model.name}") + + with torch.inference_mode(): + gemma3_vision_tower = Gemma3VisionTower( + dir_model=dir_model, + fname_out=args.outfile, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + gemma3_vision_tower.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 53d902d61..12f536cf5 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + struct clip_context_params clip_params = { + /* use_gpu */ params->n_gpu_layers != 0, + /* verbosity */ params->verbosity, + }; + auto * ctx_clip = clip_init(clip_path, clip_params); return ctx_clip; } @@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); if (num_image_embeds > 1) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), 
params->n_batch, &n_past, false); + if (has_minicpmv_projector == 2) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + } + else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } } } } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } LOG_INF("%s: image token past: %d\n", __func__, n_past); } diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py index 9b196757f..cfe0961f9 100644 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None: fname_middle = "mmproj-" has_text_encoder = False has_minicpmv_projector = True - minicpmv_version = 4 elif args.vision_only: fname_middle = "vision-" has_text_encoder = False diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8386f4eeb..8cb8d0033 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -384,8 +384,9 @@ struct server_task { SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); common_grammar_trigger trigger; trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; - trigger.value = (llama_token) token; - params.sampling.grammar_triggers.push_back(trigger); + trigger.value = word; + trigger.token = token; + params.sampling.grammar_triggers.push_back(std::move(trigger)); } else { SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); @@ -750,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - {"id", tc.id}, + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + {"id", tc.id.empty() ? 
gen_tool_call_id() : tc.id}, }); } message["tool_calls"] = tool_calls; diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 25bddbaee..569c2a1f8 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -92,6 +92,7 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -373,6 +374,7 @@ def do_test_weather(server: ServerProcess, **kwargs): tool_call = tool_calls[0] # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}' + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' actual_arguments = json.loads(tool_call["function"]["arguments"]) assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" location = actual_arguments["location"] @@ -596,6 +598,7 @@ def do_test_hello_world(server: ServerProcess, **kwargs): tool_call = tool_calls[0] # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' actual_arguments = json.loads(tool_call["function"]["arguments"]) assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" code = actual_arguments["code"] diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 393e3927c..36ad276fd 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -435,6 +435,10 @@ static std::string gen_chatcmplid() { return "chatcmpl-" + random_string(); } +static std::string gen_tool_call_id() { + return random_string(); +} + // // other common utils // diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 8b959e355..84e66b212 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -498,7 +498,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, search_paths.push_back(get_executable_path()); search_paths.push_back(fs::current_path()); } else { - search_paths.push_back(user_search_path); + search_paths.push_back(fs::u8path(user_search_path)); } int best_score = 0; @@ -512,9 +512,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); for (const auto & entry : dir_it) { if (entry.is_regular_file()) { - auto filename = entry.path().filename().native(); - auto ext = entry.path().extension().native(); - if (filename.find(file_prefix) == 0 && ext == file_extension) { + auto filename = entry.path().filename(); + auto ext = entry.path().extension(); + if (filename.native().find(file_prefix) 
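
The tc.id.empty() fallback above, together with the new gen_tool_call_id() helper in utils.hpp, boils down to one policy: keep whatever id the chat template already produced (some templates, e.g. Mistral Nemo, require a specific format), and only mint a random id for the client when the template emitted none. A standalone sketch of that policy; make_random_id and ensure_tool_call_id are illustrative names, not the helpers from utils.hpp:

    #include <random>
    #include <string>

    // Illustrative stand-in for the random id helper: only used when the
    // chat template did not emit a tool call id of its own.
    static std::string make_random_id(size_t len = 24) {
        static const char alphabet[] =
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        std::mt19937 rng{std::random_device{}()};
        std::uniform_int_distribution<size_t> pick(0, sizeof(alphabet) - 2);
        std::string id;
        for (size_t i = 0; i < len; i++) {
            id += alphabet[pick(rng)];
        }
        return id;
    }

    // Keep template-generated ids; otherwise generate one so OpenAI-compatible
    // clients always see a non-empty tool call id.
    static std::string ensure_tool_call_id(const std::string & id_from_template) {
        return id_from_template.empty() ? make_random_id() : id_from_template;
    }
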
== 0 && ext == file_extension) { dl_handle_ptr handle { dl_load_library(entry) }; if (!handle && !silent) { GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str()); @@ -545,7 +545,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, // try to load the base backend for (const auto & search_path : search_paths) { fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native(); - fs::path path = search_path.native() + filename.native(); + fs::path path = search_path / filename; if (fs::exists(path)) { return get_reg().load_backend(path, silent); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index dee49fd97..7e7c3b3c8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1461,7 +1461,7 @@ ggml_backend_sched_t ggml_backend_sched_new( bool parallel) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); - GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); + // GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index fc33fa4c0..c3f904bfb 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) +#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__) c = __builtin_amdgcn_sdot4(a, b, c, false); #elif defined(RDNA3) c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); -#elif defined(__gfx1010__) || defined(__gfx900__) +#elif defined(RDNA1) || defined(__gfx900__) int tmp1; int tmp2; asm("\n \ diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 46de14093..4067fd41b 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -52,12 +52,11 @@ typedef half (*vec_dot_KQ_f16_t)( typedef float (*vec_dot_KQ_f32_t)( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); -template +template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_v); T sum = 0.0f; @@ -93,12 +92,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( return sum; } -template +template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_v); T sum = 0.0f; @@ -138,12 +136,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1( return sum; } -template +template static __device__ 
__forceinline__ T vec_dot_fattn_vec_KQ_q5_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_v); T sum = 0.0f; @@ -186,12 +183,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0( return sum; } -template +template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_v); T sum = 0.0f; @@ -238,12 +234,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1( return sum; } -template +template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_v); T sum = 0.0f; @@ -272,12 +267,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0( return sum; } -template +template static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) { const half2 * K_h2 = (const half2 *) K_c; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); GGML_UNUSED(Q_q8); GGML_UNUSED(Q_ds_v); @@ -480,25 +474,25 @@ static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ v return x[i]; } -template +template constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) { - return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : - type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : - type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : - type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : - type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : - type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16 : + return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : + type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : + type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : + type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : + type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : + type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16 : nullptr; } -template +template constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) { - return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : - type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : - type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : - type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : - type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : - type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16 : + return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0 : + type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1 : + type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0 : + type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1 : + type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0 : + type_K == GGML_TYPE_F16 ? 
vec_dot_fattn_vec_KQ_f16 : nullptr; } @@ -681,7 +675,8 @@ static void on_no_fattn_vec_case(const int D) { template void launch_fattn( ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, - const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V + const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V, + const int warp_size = WARP_SIZE ) { constexpr int ncols = ncols1 * ncols2; @@ -704,8 +699,6 @@ void launch_fattn( GGML_ASSERT(Q->ne[3] == 1); - const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size; - ggml_cuda_pool & pool = ctx.pool(); cudaStream_t main_stream = ctx.stream(); const int id = ggml_cuda_get_device(); @@ -805,7 +798,6 @@ void launch_fattn( const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); GGML_ASSERT(block_dim.x % warp_size == 0); - GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size); fattn_kernel<<>>( (const char *) Q->data, K_data, diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 622cf2857..dab1d5cbc 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -469,6 +469,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16; const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size; float logit_softcap; memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float)); @@ -485,7 +486,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm fattn_kernel = flash_attn_ext_f16< D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; } - launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true); + launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size); return; } if (2*blocks_num_pb1 < 2*nsm) { @@ -500,7 +501,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm fattn_kernel = flash_attn_ext_f16< D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; } - launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true); + launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size); return; } constexpr int parallel_blocks = 1; @@ -514,7 +515,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm fattn_kernel = flash_attn_ext_f16< D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>; } - launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true); + launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size); } void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 4fb466ca0..a7d518a57 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -47,11 +47,89 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { 1; } +enum mmvq_parameter_table_id { + MMVQ_PARAMETERS_GENERIC = 0, + MMVQ_PARAMETERS_GCN, + MMVQ_PARAMETERS_RDNA2 +}; + +static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { +#if 
defined(RDNA2) || defined(RDNA3) + return MMVQ_PARAMETERS_RDNA2; +#elif defined(GCN) || defined(CDNA) + return MMVQ_PARAMETERS_GCN; +#else + return MMVQ_PARAMETERS_GENERIC; +#endif +} + +static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { + if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) { + return MMVQ_PARAMETERS_RDNA2; + } + if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { + return MMVQ_PARAMETERS_GCN; + } + return MMVQ_PARAMETERS_GENERIC; +} + +static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) { + if (table_id == MMVQ_PARAMETERS_GENERIC) { + switch (ncols_y) { + case 1: + case 2: + case 3: + case 4: + return 4; + case 5: + case 6: + case 7: + case 8: + return 2; + default: + return 1; + } + } else if (table_id == MMVQ_PARAMETERS_GCN) { + switch (ncols_y) { + case 1: + case 2: + case 3: + case 4: + return 2; + case 5: + case 6: + case 7: + case 8: + default: + return 1; + } + } + return 1; +} + +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) { + if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { + switch (ncols_y) { + case 1: + return 1; + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + return 2; + default: + return 1; + } + } + return 1; +} + template -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) // tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { @@ -59,24 +137,20 @@ static __global__ void mul_mat_vec_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); + constexpr mmvq_parameter_table_id table_id = get_device_table_id(); + constexpr int nwarps = calc_nwarps(ncols_y, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) - constexpr int nwarps = 1; - constexpr int rows_per_cuda_block = 1; -#else - constexpr int nwarps = ncols_y <= 4 ? 4 : 2; - constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) - - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + const int tid = warp_size*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; const int blocks_per_row_x = ncols_x / qk; const int blocks_per_col_y = nrows_y / QK8_1; - constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; + constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; -// partial sum for each thread + // partial sum for each thread float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; const block_q8_1 * y = (const block_q8_1 *) vy; @@ -96,7 +170,7 @@ static __global__ void mul_mat_vec_q( } } - __shared__ float tmp_shared[nwarps-1 > 0 ? 
nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; + __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll for (int j = 0; j < ncols_y; ++j) { @@ -120,7 +194,7 @@ static __global__ void mul_mat_vec_q( for (int l = 0; l < nwarps-1; ++l) { tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; } - tmp[j][i] = warp_reduce_sum(tmp[j][i]); + tmp[j][i] = warp_reduce_sum(tmp[j][i]); } if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) { @@ -129,6 +203,13 @@ static __global__ void mul_mat_vec_q( } } +static std::pair calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) { + const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id); + const dim3 block_nums(nblocks, 1, 1); + const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1); + return {block_nums, block_dims}; +} + template static void mul_mat_vec_q_cuda( const void * vx, const void * vy, float * dst, @@ -137,65 +218,67 @@ static void mul_mat_vec_q_cuda( GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); - int id = ggml_cuda_get_device(); - - int64_t nwarps = 1; - int64_t rows_per_cuda_block = 1; - - if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2 - switch(ncols_y) { - case 1: - nwarps = 4; - rows_per_cuda_block = 1; - break; - case 2: - case 3: - case 4: - nwarps = 4; - rows_per_cuda_block = 2; - break; - case 5: - case 6: - case 7: - case 8: - nwarps = 2; - rows_per_cuda_block = 2; - break; - default: - GGML_ABORT("fatal error"); - break; - } - } - - const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block; - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const int device = ggml_cuda_get_device(); + const int warp_size = ggml_cuda_info().devices[device].warp_size; + const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); switch (ncols_y) { case 1: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 1; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 2: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 2; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 3: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 3; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 4: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 4; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 5: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 5; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, 
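
The calc_launch_params() helper above just turns the per-architecture table into launch dimensions: one block per rows_per_block(ncols_y) rows of x, and a block of warp_size by nwarps(ncols_y) threads. A tiny host-side sketch of the same arithmetic for the generic (non-GCN, non-RDNA2) table; plain C++, no CUDA required, and the sample sizes are made up (ncols_y is at most 8 in practice, MMVQ_MAX_BATCH_SIZE):

    #include <cstdio>

    // Host-side sketch of the launch-dimension arithmetic; illustrative only.
    struct launch_dims {
        long grid_x;   // number of blocks
        int  block_x;  // threads per warp (warp_size)
        int  block_y;  // warps per block (nwarps)
    };

    static launch_dims calc(long nrows_x, int ncols_y, int warp_size) {
        const int nwarps         = ncols_y <= 4 ? 4 : 2;   // generic table from the patch
        const int rows_per_block = ncols_y == 1 ? 1 : 2;
        const long nblocks = (nrows_x + rows_per_block - 1) / rows_per_block;
        return { nblocks, warp_size, nwarps };
    }

    int main() {
        const launch_dims d = calc(/*nrows_x=*/4096, /*ncols_y=*/3, /*warp_size=*/32);
        std::printf("grid=(%ld,1,1) block=(%d,%d,1)\n", d.grid_x, d.block_x, d.block_y);
        // -> grid=(2048,1,1) block=(32,4,1)
        return 0;
    }
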
nrows_dst); break; + } case 6: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 6; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 7: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 7; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } case 8: - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + { + constexpr int c_ncols_y = 8; + std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; + } default: GGML_ABORT("fatal error"); break; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 22babf9c2..2fb1d2d7b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device; static struct ggml_backend_metal_device_context { id mtl_device; int mtl_device_ref_count; + id mtl_library; bool has_simdgroup_reduction; bool has_simdgroup_mm; @@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context { } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, + /*.mtl_library =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, /*.has_residency_sets =*/ false, @@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { + if (ctx->mtl_library) { + [ctx->mtl_library release]; + ctx->mtl_library = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) { return data; } +// load library +// +// - first check if the library is embedded +// - then check if the library is in the bundle +// - if not found, load the source and compile it +// - if that fails, return NULL +static id ggml_metal_load_library(id device, bool use_bfloat) { + id metal_library = nil; + NSError * error = nil; + NSString * src = nil; + +#if GGML_METAL_EMBED_LIBRARY + GGML_LOG_INFO("%s: using embedded metal library\n", __func__); + + extern const char ggml_metallib_start[]; + extern const char ggml_metallib_end[]; + + src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; + +#else + +#ifdef SWIFT_PACKAGE + NSBundle * bundle = SWIFTPM_MODULE_BUNDLE; +#else + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; +#endif + + NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (path_lib == nil) { + // Try to find the resource in the directory where the current binary located. 
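
Besides factoring the loading code out into ggml_metal_load_library(), the ggml-metal.m hunks above cache the compiled Metal library on the device context (the new mtl_library field), so a second backend initialization reuses it instead of recompiling, and release it only when the device reference count drops to zero. A language-neutral C++ analogue of that lazy-init-plus-refcount pattern (illustrative only; the real code is Objective-C and the names below are made up):

    #include <cstdio>

    // Illustrative analogue of the mtl_library caching: the expensive object is
    // created once per device context, reused afterwards, and destroyed together
    // with the last device reference.
    struct device_context {
        int    ref_count = 0;
        void * library   = nullptr;   // stands in for the cached Metal library
    };

    static void * load_library_expensive() {
        std::puts("compiling library (expensive, should happen once)");
        return new int(42);
    }

    static void * device_get_library(device_context & ctx) {
        if (ctx.library == nullptr) {
            ctx.library = load_library_expensive();   // lazy init, cached afterwards
        }
        return ctx.library;                           // cache hit on later calls
    }

    static void device_release(device_context & ctx) {
        if (--ctx.ref_count == 0 && ctx.library != nullptr) {
            delete static_cast<int *>(ctx.library);   // freed with the last reference
            ctx.library = nullptr;
        }
    }

    int main() {
        device_context ctx;
        ctx.ref_count = 2;           // two backends acquired the device
        device_get_library(ctx);     // compiles
        device_get_library(ctx);     // reuses the cached library
        device_release(ctx);         // still referenced
        device_release(ctx);         // last reference: library released
        return 0;
    }
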
+ NSString * current_binary = [[NSProcessInfo processInfo] arguments][0]; + NSString * bin_dir = [current_binary stringByDeletingLastPathComponent]; + NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; + if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]); + NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error]; + if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { + // Optionally, if this is a symlink, try to resolve it. + default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error]; + if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) { + // It is a relative path, adding the binary directory as directory prefix. + default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]]; + } + if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + // Link to the resource could not be resolved. + default_metallib_path = nil; + } else { + GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]); + } + } + } else { + // The resource couldn't be found in the binary's directory. + default_metallib_path = nil; + } + path_lib = default_metallib_path; + } + + if (path_lib != nil) { + // pre-compiled library found + NSURL * libURL = [NSURL fileURLWithPath:path_lib]; + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + + metal_library = [device newLibraryWithURL:libURL error:&error]; + if (error) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + } else { + GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + + NSString * path_source; + NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; + + GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? 
[path_resource UTF8String] : "nil"); + + if (path_resource) { + path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"]; + } else { + path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"]; + } + + if (path_source == nil) { + GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__); + path_source = @"ggml-metal.metal"; + } + + GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + + src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; + if (error) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + } +#endif + + if (!metal_library) { + @autoreleasepool { + // dictionary of preprocessor macros + NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + + if (use_bfloat) { + [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"]; + } + +#if GGML_METAL_EMBED_LIBRARY + [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; +#endif + + MTLCompileOptions * options = [MTLCompileOptions new]; + options.preprocessorMacros = prep; + + //[options setFastMathEnabled:false]; + + metal_library = [device newLibraryWithSource:src options:options error:&error]; + if (error) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + +#if !__has_feature(objc_arc) + [options release]; +#endif + } + } + +#if GGML_METAL_EMBED_LIBRARY + [src release]; +#endif // GGML_METAL_EMBED_LIBRARY + + return metal_library; +} + static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("%s: allocating\n", __func__); @@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - id metal_library = nil; - // load library - // - // - first check if the library is embedded - // - then check if the library is in the bundle - // - if not found, load the source and compile it - // - if that fails, return NULL - { - NSError * error = nil; - NSString * src = nil; - -#if GGML_METAL_EMBED_LIBRARY - GGML_LOG_INFO("%s: using embedded metal library\n", __func__); - - extern const char ggml_metallib_start[]; - extern const char ggml_metallib_end[]; - - src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; - -#else - -#ifdef SWIFT_PACKAGE - NSBundle * bundle = SWIFTPM_MODULE_BUNDLE; -#else - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; -#endif - - NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; - if (path_lib == nil) { - // Try to find the resource in the directory where the current binary located. - NSString * current_binary = [[NSProcessInfo processInfo] arguments][0]; - NSString * bin_dir = [current_binary stringByDeletingLastPathComponent]; - NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; - if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { - GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]); - NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error]; - if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { - // Optionally, if this is a symlink, try to resolve it. 
- default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error]; - if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) { - // It is a relative path, adding the binary directory as directory prefix. - default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]]; - } - if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { - // Link to the resource could not be resolved. - default_metallib_path = nil; - } else { - GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]); - } - } - } else { - // The resource couldn't be found in the binary's directory. - default_metallib_path = nil; - } - path_lib = default_metallib_path; - } - - if (path_lib != nil) { - // pre-compiled library found - NSURL * libURL = [NSURL fileURLWithPath:path_lib]; - GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); - - metal_library = [device newLibraryWithURL:libURL error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } - } else { - GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - - NSString * path_source; - NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - - GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); - - if (path_resource) { - path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"]; - } else { - path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"]; - } - - if (path_source == nil) { - GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__); - path_source = @"ggml-metal.metal"; - } - - GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); - - src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } - } -#endif - - if (!metal_library) { - @autoreleasepool { - // dictionary of preprocessor macros - NSMutableDictionary * prep = [NSMutableDictionary dictionary]; - - if (ctx_dev->use_bfloat) { - [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"]; - } - -#if GGML_METAL_EMBED_LIBRARY - [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; -#endif - - MTLCompileOptions * options = [MTLCompileOptions new]; - options.preprocessorMacros = prep; - - //[options setFastMathEnabled:false]; - - metal_library = [device newLibraryWithSource:src options:options error:&error]; - if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } - -#if !__has_feature(objc_arc) - [options release]; -#endif - } - } - -#if GGML_METAL_EMBED_LIBRARY - [src release]; -#endif // GGML_METAL_EMBED_LIBRARY + if (ctx_dev->mtl_library == nil) { + ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + } + id metal_library = ctx_dev->mtl_library; + if (metal_library == nil) { + GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__); + return NULL; } // print MTL GPU family: @@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * 
ggml_metal_init(ggml_backend_dev_t de [metal_function release]; \ if (error) { \ GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ - [metal_library release]; \ return NULL; \ } \ } else { \ @@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); } - [metal_library release]; - return ctx; } diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 45328a657..59a208fe9 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING) endif () add_compile_definitions(GGML_OPENCL_SOA_Q) +add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION}) if (GGML_OPENCL_USE_ADRENO_KERNELS) message(STATUS "OpenCL will use matmul kernels optimized for Adreno") diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index b85a895c4..14d9934fb 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -1,4 +1,4 @@ -#define CL_TARGET_OPENCL_VERSION 220 +#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // suppress warnings in CL headers for GCC and Clang @@ -25,6 +25,8 @@ #include #include #include +#include +#include #undef MIN #undef MAX @@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN { X1E, }; +struct ggml_cl_version { + cl_uint major = 0; + cl_uint minor = 0; +}; + +// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes. +static ggml_cl_version parse_cl_version(std::string_view str) { + size_t major_str_begin = 0; + size_t major_str_end = str.find(".", major_str_begin); + if (major_str_end == std::string::npos) { + return {}; + } + + size_t minor_str_begin = major_str_end + 1; + size_t minor_str_end = str.find(" ", minor_str_begin); + if (minor_str_end == std::string::npos) { + return {}; + } + + cl_uint version_major; + if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) { + return {}; + } + + cl_uint version_minor; + if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) { + return {}; + } + return { version_major, version_minor }; +} + +// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes. +static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) { + size_t param_size; + CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, ¶m_size)); + std::unique_ptr param_storage(new char[param_size]); + CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr)); + + auto param_value = std::string_view(param_storage.get(), param_size); + const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY " + if (param_value.find(version_prefix) != 0) { + return {}; + } + param_value.remove_prefix(version_prefix.length()); + return parse_cl_version(param_value); +} + +// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes. 
+static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) { + size_t param_size; + +#if CL_TARGET_OPENCL_VERSION >= 300 + if (platform_version.major >= 3) { + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size)); + if (!param_size) { + return {}; + } + + std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr)); + unsigned versions_count = param_size / sizeof(cl_name_version); + + cl_version version_max = 0; + for (unsigned i = 0; i < versions_count; i++) { + version_max = std::max(versions[i].version, version_max); + } + + return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) }; + } +#else + GGML_UNUSED(platform_version); +#endif // CL_TARGET_OPENCL_VERSION >= 300 + + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size)); + if (!param_size) { + return {}; + } + + std::unique_ptr<char[]> param_storage(new char[param_size]); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr)); + auto param_value = std::string_view(param_storage.get(), param_size); + + const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY " + if (param_value.find(version_prefix) != 0) { + return {}; + } + param_value.remove_prefix(version_prefix.length()); + + return parse_cl_version(param_value); +} + static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { if (strstr(device_name, "730") || strstr(device_name, "740") || @@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // A local ref of cl_device_id for convenience cl_device_id device = backend_ctx->device; - // Check device OpenCL version, OpenCL 2.0 or above is required - size_t device_ver_str_size; - clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size); - char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1); - clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL); - device_ver_buffer[device_ver_str_size] = '\0'; - GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer); + ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id); - if (strstr(device_ver_buffer, "OpenCL 2") == NULL && - strstr(device_ver_buffer, "OpenCL 3") == NULL) { + // Check device OpenCL version, OpenCL 2.0 or above is required + ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); + if (opencl_c_version.major < 2) { GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); return backend_ctx; } @@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x) - if (strstr(device_ver_buffer, "OpenCL 3") && - strstr(ext_buffer, "cl_khr_subgroups") == NULL && + if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL && strstr(ext_buffer, "cl_intel_subgroups") == NULL) { GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " "(note that subgroups is an optional feature in OpenCL 3.0)\n"); @@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { const std::string kernel_src = read_file("ggml-opencl.cl"); #endif
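The hunk that follows replaces the hard-coded -cl-std=CL2.0 with a flag derived from the OpenCL C version that parse_cl_version / get_opencl_c_version report for the device. A minimal standalone sketch of that derivation, assuming only the C++17 standard library; ggml_cl_version mirrors the struct added by this patch, while parse_xy and make_cl_std_flag are illustrative helpers, not code from the patch (error handling that the patch performs via std::errc is omitted here):

#include <charconv>
#include <string>
#include <string_view>

struct ggml_cl_version { unsigned major = 0, minor = 0; };

// Parse "X.Y ..." as it remains after stripping the "OpenCL C " prefix.
static ggml_cl_version parse_xy(std::string_view s) {
    ggml_cl_version v;
    const size_t dot = s.find('.');
    if (dot == std::string_view::npos) return {};
    std::from_chars(s.data(), s.data() + dot, v.major);            // stops at '.'
    std::from_chars(s.data() + dot + 1, s.data() + s.size(), v.minor); // stops at first non-digit
    return v;
}

// Turn the parsed version into the compile flag used below, e.g. {3, 0} -> "-cl-std=CL3.0".
static std::string make_cl_std_flag(ggml_cl_version v) {
    return "-cl-std=CL" + std::to_string(v.major) + "." + std::to_string(v.minor);
}

Because CL_TARGET_OPENCL_VERSION is now taken from GGML_OPENCL_TARGET_VERSION, the CL_DEVICE_OPENCL_C_ALL_VERSIONS query above is only compiled in when the build targets OpenCL 3.0 or newer.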
- std::string compile_opts = - "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations " - "-cl-finite-math-only -cl-fast-relaxed-math "; + auto opencl_c_std = + std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor); + + std::string compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable -cl-unsafe-math-optimizations" + " -cl-finite-math-only -cl-fast-relaxed-math"; backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts); // Non matmul kernels. @@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err)); // Gemv general - std::string CL_gemv_compile_opts = - " -cl-std=CL2.0 " - " -cl-mad-enable " - " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); if (has_vector_subgroup_broadcast) { CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; } @@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err)); // Gemv 2048, 16384 - CL_gemv_compile_opts = - " -cl-std=CL2.0 " - " -cl-mad-enable " - " -DLINE_STRIDE_A=2048 " - " -DBLOCK_STRIDE_A=16384 " - " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); + CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DLINE_STRIDE_A=2048 " + " -DBLOCK_STRIDE_A=16384 " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); if (has_vector_subgroup_broadcast) { CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; } @@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err)); // Gemv 2048, 16384 - CL_gemv_compile_opts = - " -cl-std=CL2.0 " - " -cl-mad-enable " - " -DLINE_STRIDE_A=2048 " - " -DBLOCK_STRIDE_A=16384 " - " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); + CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DLINE_STRIDE_A=2048 " + " -DBLOCK_STRIDE_A=16384 " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); if (has_vector_subgroup_broadcast) { CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; } @@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err)); // Gemv 5504, 44032 - CL_gemv_compile_opts = - " -cl-std=CL2.0 " - " -cl-mad-enable " - " -DLINE_STRIDE_A=5504 " - " -DBLOCK_STRIDE_A=44032 " - " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); + CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DLINE_STRIDE_A=5504 " + " -DBLOCK_STRIDE_A=44032 " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); if 
(has_vector_subgroup_broadcast) { CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; } @@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err)); // Gemv 16000, 128000 - CL_gemv_compile_opts = - " -cl-std=CL2.0 " - " -cl-mad-enable " - " -DLINE_STRIDE_A=16000 " - " -DBLOCK_STRIDE_A=128000 " - " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); + CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DLINE_STRIDE_A=16000 " + " -DBLOCK_STRIDE_A=128000 " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); if (has_vector_subgroup_broadcast) { CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 221f65c21..a96286d71 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -3,44 +3,42 @@ #include template -static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); +static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); if (row >= nrows) { return; } - const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; - assert(blocks_per_warp>0); + const int blocks_per_row = ncols / qk; + constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0 -// partial sum for each thread + assert(blocks_per_warp > 0); + + // partial sum for each thread float tmp = 0.0f; - const block_q_t * x = (const block_q_t *) vx; + const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; - i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row * blocks_per_row + i; // x block index - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + const int iby = i * (qk / QK8_1); // y block index that aligns with ibx - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int + for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) { + const int iqs = elem + vdr * (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + } } // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } if (item_ct1.get_local_id(2) == 0) { @@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void 
*__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread @@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void 
*__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, } const int blocks_per_row = ncols / qk; - const int blocks_per_warp = vdr * QK_WARP_SIZE / qi; + const int blocks_per_warp = vdr * WARP_SIZE / qi; assert(blocks_per_warp>0); // partial sum for each thread float tmp = 0.0f; @@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( 
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, 
block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { @@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ 
-751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler & cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq3_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq3_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq1_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -860,13 +858,13 @@ static void 
mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq1_m_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK4_NL == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq4_nl_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { stream->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [[intel::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq4_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index 8cdc640e8..423ceb8a3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -5,23 +5,24 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16]; -shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16]; +shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16]; +shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; +uint csel = 0; void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { const uint y_idx = i * QUANT_K + y_offset; [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + csel ^= 1; - barrier(); if (!all_threads) { // when we don't have enough blocks to use all threads if (i < num_blocks_per_row) { const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); - sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); - sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); + sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF); + sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); } barrier(); @@ 
-29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, continue; } else { const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]); - sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF); - sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); + sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF); + sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF); barrier(); } @@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 2; ++l) { - sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ], - fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2], - fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ], - fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2], - fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ], - fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2], - fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ], - fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1)))))))); - sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im], - fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im], - fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im], - fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im], - fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im], - fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im], - fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im], - fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2)))))))); + sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l ], + fma(FLOAT_TYPE(b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2], + fma(FLOAT_TYPE(b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l ], + fma(FLOAT_TYPE(b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2], + fma(FLOAT_TYPE(b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l ], + fma(FLOAT_TYPE(b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2], + fma(FLOAT_TYPE(b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l ], + fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1)))))))); + sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[csel][ix][ 8*v_im], + fma(FLOAT_TYPE(b16[l]), sccache2[csel][ix][1 + 8*v_im], + fma(FLOAT_TYPE(b32[l]), sccache2[csel][ix][2 + 8*v_im], + fma(FLOAT_TYPE(b48[l]), sccache2[csel][ix][3 + 8*v_im], + fma(FLOAT_TYPE(b64[l]), sccache2[csel][ix][4 + 8*v_im], + fma(FLOAT_TYPE(b80[l]), sccache2[csel][ix][5 + 8*v_im], + fma(FLOAT_TYPE(b96[l]), sccache2[csel][ix][6 + 8*v_im], + fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2)))))))); } temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n])); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 3116fad16..e91724a28 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -5,20 +5,21 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8]; +shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; +uint csel = 0; void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint 
num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { const uint y_idx = i * QUANT_K + y_offset; [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + csel ^= 1; if (!all_threads) { // when we don't have enough blocks to use all threads - barrier(); if (i < num_blocks_per_row) - sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); + sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); barrier(); if (i >= num_blocks_per_row) @@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303)); if (all_threads) { - barrier(); - sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); + sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32); barrier(); } @@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 2; ++l) { - sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ], - fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2], - fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ], - fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2], - fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ], - fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2], - fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ], - fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum)))))))); + sum = fma(FLOAT_TYPE( b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l ] - hmk_0[l ], + fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2], + fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l ] - hmk_1[l ], + fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2], + fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l ] - hmk_2[l ], + fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2], + fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l ] - hmk_3[l ], + fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum)))))))); } temp[j][n] = fma(d, sum, temp[j][n]); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index f05f96b5e..d53d9ee0a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -6,20 +6,21 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16]; +shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; +uint csel = 0; void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const 
uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) { const uint y_idx = i * QUANT_K + y_offset; [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + csel ^= 1; if (!all_threads) { // when we don't have enough blocks to use all threads - barrier(); if (i < num_blocks_per_row) - sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); + sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); barrier(); if (i >= num_blocks_per_row) @@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const vec4 q3 = vec4(unpack8(q3_u32)) - 32; if (all_threads) { - barrier(); - sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); + sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]); barrier(); } @@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]); sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]); } - temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]); + temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]); } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 39657195c..dd7b13e27 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -743,7 +743,7 @@ void main() { [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); - [[unroll]] for (uint col = 0; col < BN; col += storestride) { + [[unroll]] for (uint col = 0; col < TN; col += storestride) { const uint row_i = dc + cm_col * TN + col + store_c; if (row_i >= _ne1) break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ecac5b4bb..19624eae0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -253,6 +253,7 @@ class MODEL_ARCH(IntEnum): MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() + GEMMA3 = auto() STARCODER2 = auto() RWKV6 = auto() RWKV6QWEN2 = auto() @@ -440,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", @@ -1077,6 +1079,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 390ed07db..fbfd59132 100644 --- a/gpttype_adapter.cpp +++ 
b/gpttype_adapter.cpp @@ -2876,11 +2876,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs) bool llava_images_changed = false; - bool add_bos_token = true; - if(file_format == FileFormat::GGUF_GENERIC) + bool add_bos_token = true; //if set to false, mmproj handling breaks + if(file_format == FileFormat::GGUF_GENERIC && mmproj_filename == "") { const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model)); add_bos_token = llama_vocab_get_add_bos(tmpvocab); + if(!add_bos_token) + { + printf("\nBOS token prefix was disabled for this model."); + } } for(int x=0;x LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -766,6 +767,26 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, }, }, + { + LLM_ARCH_GEMMA3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 122fdcebe..2ec2e2362 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -40,6 +40,7 @@ enum llm_arch { LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, + LLM_ARCH_GEMMA3, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0f0205581..1736c7ec8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -869,6 +870,23 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GEMMA3: + { + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 26: type = LLM_TYPE_1B; break; + case 34: type = LLM_TYPE_4B; break; + case 48: type = LLM_TYPE_12B; break; + case 62: type = LLM_TYPE_27B; break; + default: type = LLM_TYPE_UNKNOWN; + } + + hparams.f_attention_scale = type == LLM_TYPE_27B + ? 
1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0))) + : 1.0f / std::sqrt(float(hparams.n_embd_head_k)); + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2549,6 +2567,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; + case LLM_ARCH_GEMMA3: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); @@ -3746,6 +3793,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); + LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); @@ -4022,6 +4070,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { case LLM_ARCH_PHIMOE: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: + case LLM_ARCH_GEMMA3: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: diff --git a/src/llama.cpp b/src/llama.cpp index 5e39fcb16..4c49e9a69 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5014,6 +5014,149 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_gemma3() 
{ + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) + if (ubatch.token) { + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + } + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // gemma3 requires different mask for layers using sliding window (SWA) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); + struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + + // "5-to-1 interleaved attention" + // 5 layers of local attention followed by 1 layer of global attention + static const int sliding_window_pattern = 6; + + for (int il = 0; il < n_layer; ++il) { + const bool is_sliding = (il + 1) % sliding_window_pattern; + const float freq_base_l = is_sliding ? 10000.0f : freq_base; + const float freq_scale_l = is_sliding ? 1.0f : freq_scale; + struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens); + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens); + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + 
model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, sa_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } struct ggml_cgraph * build_starcoder2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -8334,6 +8477,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gemma2(); } break; + case LLM_ARCH_GEMMA3: + { + result = llm.build_gemma3(); + } break; case LLM_ARCH_STARCODER2: { result = llm.build_starcoder2();
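The build_gemma3 graph above interleaves sliding-window and global attention and picks the RoPE frequency base per layer. A small self-contained sketch of that schedule, assuming nothing beyond the (il + 1) % 6 test and the 10000.0f local base shown in the patch (is_sliding_layer is an illustrative name):

#include <cstdio>

// "5-to-1 interleaved attention": five local (sliding-window) layers, then one global layer.
constexpr int sliding_window_pattern = 6;

static bool is_sliding_layer(int il) {
    // mirrors `(il + 1) % sliding_window_pattern` in build_gemma3
    return (il + 1) % sliding_window_pattern != 0;
}

int main() {
    // print the schedule for the first two pattern periods: L = local/SWA, G = global
    for (int il = 0; il < 12; ++il) {
        std::printf("layer %2d: %c\n", il, is_sliding_layer(il) ? 'L' : 'G');
    }
    return 0;
}

Layers 5, 11, 17, ... therefore attend with the full KQ_mask and the configured freq_base, while all other layers use KQ_mask_swa and a RoPE base of 10000.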
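The load_hparams hunk for LLM_ARCH_GEMMA3 also sets a size-dependent attention scale. Written out on its own as a sketch (gemma3_attn_scale is an illustrative name and the parameters stand in for the hparams fields used in the patch):

#include <cmath>
#include <cstdint>

// 27B: queries are scaled by 1/sqrt(n_embd / n_head); the other sizes by 1/sqrt(n_embd_head_k).
static float gemma3_attn_scale(bool is_27b, uint32_t n_embd, uint32_t n_head, uint32_t n_embd_head_k) {
    return is_27b ? 1.0f / std::sqrt(float(n_embd / n_head))
                  : 1.0f / std::sqrt(float(n_embd_head_k));
}

The value lands in hparams.f_attention_scale, is reported by the new f_attn_scale log line in print_info, and is forwarded to llm_build_kv inside build_gemma3.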
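Back in the SYCL changes, the generic mul_mat_vec_q template now rounds blocks_per_warp up instead of truncating, and walks the quant indices in WARP_SIZE-sized steps, so switching from QK_WARP_SIZE to the device WARP_SIZE can no longer leave assert(blocks_per_warp > 0) with a zero. A tiny illustration of the rounding change; the numbers are arbitrary placeholders, not taken from any specific quant type:

#include <cassert>

// Old form: vdr * WARP_SIZE / qi truncates to 0 whenever qi > vdr * WARP_SIZE.
// New form: round up, guaranteeing at least one block per warp.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int WARP_SIZE = 16;   // placeholder sub-group width
    const int vdr = 1, qi = 32; // placeholder quant parameters with qi > vdr * WARP_SIZE

    const int old_blocks_per_warp = vdr * WARP_SIZE / qi;          // 16 / 32 == 0
    const int new_blocks_per_warp = ceil_div(vdr * WARP_SIZE, qi); // (16 + 31) / 32 == 1

    assert(old_blocks_per_warp == 0);
    assert(new_blocks_per_warp == 1);
    return 0;
}

The added inner loop over `elem` in the same template covers the remaining quant indices whenever qi / vdr exceeds the sub-group width.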