gemma3 vision works, but is using more tokens than expected - may need resizing
Commit: 77debb1b1b
39 changed files with 2140 additions and 814 deletions
Makefile (2 lines changed)
@@ -639,6 +639,8 @@ ttsmain: examples/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llam
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+gemma3-cli: examples/llava/gemma3-cli.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 	@echo 'This command can be MANUALLY run to regenerate vulkan shaders. Normally concedo will do it, so you do not have to.'
@@ -1868,16 +1868,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
common/chat.cpp (311 lines changed)
@@ -60,7 +60,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
         }
         msg.role = message.at("role");

-        if (message.contains("content")) {
+        auto has_content = message.contains("content");
+        auto has_tool_calls = message.contains("tool_calls");
+        if (has_content) {
             const auto & content = message.at("content");
             if (content.is_string()) {
                 msg.content = content;
@@ -81,19 +83,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
             } else if (!content.is_null()) {
                 throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
             }
-        } else {
-            throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
         }
-        if (message.contains("reasoning_content")) {
-            msg.reasoning_content = message.at("reasoning_content");
-        }
-        if (message.contains("name")) {
-            msg.tool_name = message.at("name");
-        }
-        if (message.contains("tool_call_id")) {
-            msg.tool_call_id = message.at("tool_call_id");
-        }
-        if (message.contains("tool_calls")) {
+        if (has_tool_calls) {
             for (const auto & tool_call : message.at("tool_calls")) {
                 common_chat_tool_call tc;
                 if (!tool_call.contains("type")) {
@@ -118,6 +109,18 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
                 msg.tool_calls.push_back(tc);
             }
         }
+        if (!has_content && !has_tool_calls) {
+            throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
+        }
+        if (message.contains("reasoning_content")) {
+            msg.reasoning_content = message.at("reasoning_content");
+        }
+        if (message.contains("name")) {
+            msg.tool_name = message.at("name");
+        }
+        if (message.contains("tool_call_id")) {
+            msg.tool_call_id = message.at("tool_call_id");
+        }

         msgs.push_back(msg);
     }
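With this change, an OpenAI-compatible assistant message may carry `tool_calls` without any `content`; only a message that has neither is rejected. A minimal illustration of such a payload (the tool name and arguments are hypothetical, shown only to make the accepted shape concrete):

```python
# Hypothetical assistant turn in an OpenAI-compatible chat request.
# It has no "content" key at all; after this change the parser accepts it because
# "tool_calls" is present. A message with neither field still raises an error.
assistant_turn = {
    "role": "assistant",
    "tool_calls": [
        {
            "type": "function",
            "id": "call_0",
            "function": {
                "name": "get_weather",                     # hypothetical tool
                "arguments": "{\"location\": \"Paris\"}",  # JSON-encoded string
            },
        }
    ],
}
```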
@@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
+        case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
         default:
@@ -875,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
     return data;
 }
 static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
-    static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
-    static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
-    static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
+    static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
+    static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
+    static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");

     std::smatch match;

@@ -1009,10 +1013,10 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
 }
 static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
     // TODO: tighten & simplify the parser, don't accept leading text context.
-    static std::regex function_regex(
+    static const std::regex function_regex(
         "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
-    static std::regex close_regex("\\}\\s*");
-    static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
+    static const std::regex close_regex("\\}\\s*");
+    static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");

     if (with_builtin_tools) {
         std::smatch match;
|
||||||
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
|
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
|
static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
|
||||||
static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
|
||||||
static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
|
|
||||||
static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
|
|
||||||
static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
|
|
||||||
common_chat_msg msg;
|
|
||||||
msg.role = "assistant";
|
|
||||||
std::smatch match;
|
std::smatch match;
|
||||||
|
static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
|
||||||
if (std::regex_match(input, match, reasoning_content_regex)) {
|
if (std::regex_match(input, match, reasoning_content_regex)) {
|
||||||
std::string rest;
|
auto rest = match[3].str();
|
||||||
|
auto msg = rest_parser(rest);
|
||||||
|
auto reasoning_content = string_strip(match[2].str());
|
||||||
if (extract_reasoning) {
|
if (extract_reasoning) {
|
||||||
msg.reasoning_content = string_strip(match[2].str());
|
msg.reasoning_content = reasoning_content;
|
||||||
} else {
|
} else if (!reasoning_content.empty()) {
|
||||||
msg.content = match[1].str();
|
std::ostringstream content;
|
||||||
|
content << "<think>" << reasoning_content << "</think>" << msg.content;
|
||||||
|
msg.content = content.str();
|
||||||
}
|
}
|
||||||
rest = match[3].str();
|
return msg;
|
||||||
|
}
|
||||||
|
return rest_parser(input);
|
||||||
|
}
|
||||||
|
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
|
||||||
|
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
|
||||||
|
static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
||||||
|
static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
|
||||||
|
static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
|
||||||
|
|
||||||
if (std::regex_search(rest, match, tool_calls_regex)) {
|
common_chat_msg msg;
|
||||||
|
msg.role = "assistant";
|
||||||
|
std::smatch match;
|
||||||
|
if (std::regex_search(input, match, tool_calls_regex)) {
|
||||||
auto tool_calls = match[1].str();
|
auto tool_calls = match[1].str();
|
||||||
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
|
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
|
||||||
msg.tool_calls = std::move(msg2.tool_calls);
|
msg.tool_calls = std::move(msg2.tool_calls);
|
||||||
} else {
|
} else {
|
||||||
msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end());
|
msg.content = input;
|
||||||
}
|
}
|
||||||
} else {
|
return msg;
|
||||||
msg.content = input;
|
});
|
||||||
}
|
|
||||||
return msg;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||||
|
@ -1234,8 +1246,8 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||||
}
|
}
|
||||||
|
|
||||||
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
|
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
|
||||||
static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
|
static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
|
||||||
static std::regex close_regex(R"($|(?=>>>))");
|
static const std::regex close_regex(R"($|(?=>>>))");
|
||||||
|
|
||||||
std::string content;
|
std::string content;
|
||||||
auto it = input.begin();
|
auto it = input.begin();
|
||||||
|
@ -1324,7 +1336,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
|
||||||
}
|
}
|
||||||
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
||||||
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
|
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
|
||||||
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
|
static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
|
||||||
std::smatch match;
|
std::smatch match;
|
||||||
if (std::regex_search(input, match, python_tag_regex)) {
|
if (std::regex_search(input, match, python_tag_regex)) {
|
||||||
auto code = match[1].str();
|
auto code = match[1].str();
|
||||||
|
@ -1338,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
|
||||||
});
|
});
|
||||||
return msg;
|
return msg;
|
||||||
}
|
}
|
||||||
static std::regex function_regex(R"(<function=(\w+)>)");
|
static const std::regex function_regex(R"(<function=(\w+)>)");
|
||||||
static std::regex close_regex(R"(</function>)");
|
static const std::regex close_regex(R"(</function>)");
|
||||||
// TODO: tighten & simplify.
|
// TODO: tighten & simplify.
|
||||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||||
}
|
}
|
||||||
|
@@ -1406,6 +1418,8 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             "(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
         });
     data.preserved_tokens = {
+        "<think>",
+        "</think>",
         "<tool_call>",
         "</tool_call>",
         "<function",
@@ -1426,122 +1440,123 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
[the shared parser body is shown once below; apart from the marked lines it is only re-indented one level to sit inside the new lambda]

         });

     data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
-    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
+    data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
     return data;
 }
-static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input) {
-    const static std::regex open_regex(
+static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
+    return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
+        static const std::regex open_regex(
             "(?:"
             "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
             "(<tool_call>"               // match 2 (open_tag)
             "|<function_call>"
             "|<tool>"
             "|<tools>"
             "|<response>"
             "|<json>"
             "|<xml>"
             "|<JSON>"
             ")?"
             "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
             ")"
             "|"
             "(?:<function=([^>]+)>"          // match 4 (function name)
             "|<function name=\"([^\"]+)\">)" // match 5 (function name again)
             "([\\s\\S]*)"                    // match 6 (function arguments + rest)})"
         );

         try {
             common_chat_msg msg;
             msg.role = "assistant";

             std::string::const_iterator it = input.begin();
             const std::string::const_iterator end = input.end();
             std::smatch match;

             while (it != end) {
                 if (std::regex_search(it, end, match, open_regex)) {
                     // Add content before the match
                     msg.content += std::string(it, match[0].first);

                     auto block_start = match[1].str();
                     std::string block_end = block_start.empty() ? "" : "```";

                     auto open_tag = match[2].str();
                     std::string close_tag;

                     if (match[3].matched) {
                         close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
                         auto json_it = match[3].first;
                         json tool_call;
                         if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
                             msg.tool_calls.emplace_back(process_tool_call(tool_call));
                             it = json_it;  // Move iterator past parsed JSON

                             // Handle close tags
                             consume_spaces(it, end);
                             if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
                                 throw std::runtime_error("Failed to parse closing tag");
                             }
                             consume_spaces(it, end);
                             if (!block_end.empty() && !parse_literal(it, end, block_end)) {
                                 throw std::runtime_error("Failed to parse block end");
                             }
                             consume_spaces(it, end);
                         } else {
                             // Not a valid tool call, treat as content
                             msg.content += std::string(match[0].first, match[0].second);
                             it = match[0].second;
                         }
                     } else {
                         auto function_name = match[4].str();
                         if (function_name.empty()) {
                             function_name = match[5].str();
                         }
                         GGML_ASSERT(!function_name.empty());

                         close_tag = "</function>";
                         // Start parsing from after the opening tags
                         auto json_it = match[6].first;
                         json arguments;
                         if (parse_json(json_it, end, arguments)) {
                             msg.tool_calls.emplace_back(process_tool_call({
                                 {"name", function_name},
                                 {"arguments", arguments},
                             }));
                             it = json_it;  // Move iterator past parsed JSON

                             // Handle close tags
                             consume_spaces(it, end);
                             if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
                                 throw std::runtime_error("Failed to parse closing tag");
                             }
                             consume_spaces(it, end);
                             if (!block_end.empty() && !parse_literal(it, end, block_end)) {
                                 throw std::runtime_error("Failed to parse block end");
                             }
                             consume_spaces(it, end);
                         } else {
                             // Not a valid tool call, treat as content
                             msg.content += std::string(match[0].first, match[0].second);
                             it = match[0].second;
                         }
                     }
                 } else {
                     // Add remaining content
                     msg.content += std::string(it, end);
                     break;
                 }
             }
             return msg;
         } catch (const std::exception & e) {
             LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
             common_chat_msg msg;
             msg.role = "assistant";
             msg.content = input;
             return msg;
         }
-}
+    });
+}

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1606,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }

+    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
+    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_hermes_2_pro(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1627,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_without_tools(tmpl, params);
     }

-    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
-    if (src.find("<tool_call>") != std::string::npos) {
-        return common_chat_params_init_hermes_2_pro(tmpl, params);
-    }
-
     // Functionary v3.1 (w/ tools)
     if (src.find("<|start_header_id|>") != std::string::npos
         && src.find("<function=") != std::string::npos) {
@@ -1749,7 +1764,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
             return common_chat_parse_functionary_v3_1_llama_3_1(input);
         case COMMON_CHAT_FORMAT_HERMES_2_PRO:
-            return common_chat_parse_hermes_2_pro(input);
+            return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
+        case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
+            return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
             return common_chat_parse_firefunction_v2(input);
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
@@ -53,6 +53,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
@@ -403,8 +403,6 @@ struct common_params {
     int32_t i_pos = -1; // position of the passkey in the junk text

     // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
@@ -416,16 +414,16 @@ struct common_params {
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill

-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
     // batched-bench params
     bool batched_bench_output_jsonl = false;

+    // common params
+    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon
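After this change common_params keeps a single, initially empty out_file, and the per-tool defaults (imatrix.dat, control_vector.gguf, ggml-lora-merged-f16.gguf) are left to the individual example programs. A rough Python-style illustration of that pattern, purely to show the intent (the names are made up and this is not the llama.cpp API):

```python
# Hypothetical per-tool fallbacks mirroring the defaults removed from common_params.
DEFAULT_OUT_FILE = {
    "imatrix":           "imatrix.dat",
    "cvector-generator": "control_vector.gguf",
    "export-lora":       "ggml-lora-merged-f16.gguf",
}

def resolve_out_file(tool: str, cli_value: str | None) -> str:
    # One shared "-o/--output" value; each tool falls back to its own default.
    return cli_value if cli_value else DEFAULT_OUT_FILE[tool]

print(resolve_out_file("imatrix", None))               # imatrix.dat
print(resolve_out_file("export-lora", "merged.gguf"))  # merged.gguf
```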
@@ -861,6 +861,9 @@ class Model:
         for token_id, token_data in added_tokens_decoder.items():
             token_id = int(token_id)
             token: str = token_data["content"]
+            if token_id >= vocab_size:
+                logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                continue
             if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 if tokens[token_id] != token.encode("utf-8"):
                     logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -3322,6 +3325,83 @@ class Gemma2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
+class Gemma3Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3
+    has_vision: bool = False
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = Model.load_hparams(kwargs["dir_model"])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+
+    def write(self):
+        super().write()
+        if self.has_vision:
+            logger.info("NOTE: this script only convert the language model to GGUF")
+            logger.info("for the vision model, please use gemma3_convert_encoder_to_gguf.py")
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        # some default values are not specified in the hparams
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
+        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        assert hparams.get("attn_logit_softcapping") is None
+        assert hparams.get("final_logit_softcapping") is None
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
+        if hparams.get("rope_scaling") is not None:
+            assert hparams["rope_scaling"]["rope_type"] == "linear"
+            # important: this rope_scaling is only applied for global layers, and not used by 1B model
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+        elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later
+            # ignore vision tensors
+            return []
+
+        # remove OOV (out-of-vocabulary) rows in token_embd
+        if "embed_tokens.weight" in name:
+            vocab = self._create_vocab_sentencepiece()
+            tokens = vocab[0]
+            data_torch = data_torch[:len(tokens)]
+
+        # ref code in Gemma3RMSNorm
+        # output = output * (1.0 + self.weight.float())
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
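The `data_torch + 1` on `norm.weight` reflects the Gemma-style RMSNorm referenced in the comment, which scales by `(1.0 + weight)`; storing `weight + 1` in the GGUF lets a plain RMSNorm that scales by `weight` reproduce the same output. A small self-contained check of that equivalence (illustrative only, not part of the converter):

```python
import torch

def gemma3_rmsnorm(x, weight, eps=1e-6):
    # Reference behaviour: output = rms_norm(x) * (1.0 + weight)
    normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return normed * (1.0 + weight)

def plain_rmsnorm(x, weight, eps=1e-6):
    # Standard kernel: output = rms_norm(x) * weight
    normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return normed * weight

x = torch.randn(4, 8)
w = torch.randn(8)
# Folding the +1 into the stored weight makes the plain kernel match the reference.
assert torch.allclose(gemma3_rmsnorm(x, w), plain_rmsnorm(x, w + 1))
```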
examples/llava/README-gemma3.md (30 lines, new file)

@@ -0,0 +1,30 @@
+# Gemma 3 vision
+
+> [!IMPORTANT]
+>
+> This is very experimental, only used for demo purpose.
+
+## How to get mmproj.gguf?
+
+```bash
+cd gemma-3-4b-it
+python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py .
+
+# output file is mmproj.gguf
+```
+
+## How to run it?
+
+What you need:
+- The text model GGUF, can be converted using `convert_hf_to_gguf.py`
+- The mmproj file from step above
+- An image file
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# run it
+./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
+```
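The commit message notes that Gemma 3 vision is currently using more tokens than expected and that the image may need resizing. Until that is handled inside the tooling, one workaround is to downscale images before passing them to llama-gemma3-cli; a small sketch with Pillow (the 896 px target is an assumption about the vision encoder's native resolution, not something this commit configures):

```python
from PIL import Image

def shrink_for_gemma3(path_in: str, path_out: str, target: int = 896) -> None:
    """Downscale so the longer side is at most `target` px (never upscales)."""
    img = Image.open(path_in)
    img.thumbnail((target, target))   # in-place, keeps aspect ratio
    img.save(path_out)

shrink_for_gemma3("your_image.jpg", "your_image_small.jpg")
# then: ./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image_small.jpg
```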
@@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w

 Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.

+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+
 Clone llama.cpp:
 ```bash
-git clone git@github.com:OpenBMB/llama.cpp.git
+git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-git checkout minicpm-omni
 ```

+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
 ### Usage of MiniCPM-o 2.6

 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
@@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

 # quantize int4 version
-./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```

-Build llama.cpp using `CMake`:
-https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
-
-```bash
-cmake -B build
-cmake --build build --config Release
-```
-
 Inference on Linux or Mac
-```
+```bash
 # run f16 version
-./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

 # run quantized int4 version
-./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

-# or run in interactive mode
-./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
 ```
@ -4,13 +4,26 @@
|
||||||
|
|
||||||
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
|
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
|
||||||
|
|
||||||
|
|
||||||
|
### Build llama.cpp
|
||||||
|
Readme modification time: 20250206
|
||||||
|
|
||||||
|
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggml-org/llama.cpp
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
### Usage
|
Build llama.cpp using `CMake`:
|
||||||
|
```bash
|
||||||
|
cmake -B build
|
||||||
|
cmake --build build --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Usage of MiniCPM-Llama3-V 2.5
|
||||||
|
|
||||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||||
|
|
||||||
|
@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||||
```
|
```
|
||||||
|
|
||||||
Build for Linux or Mac
|
|
||||||
|
|
||||||
```bash
|
|
||||||
make
|
|
||||||
make llama-minicpmv-cli
|
|
||||||
```
|
|
||||||
|
|
||||||
Inference on Linux or Mac
|
Inference on Linux or Mac
|
||||||
```
|
```bash
|
||||||
# run f16 version
|
# run f16 version
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||||
|
|
||||||
# run quantized int4 version
|
# run quantized int4 version
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||||
|
|
||||||
# or run in interactive mode
|
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
|
||||||
```
|
|
||||||
|
|
||||||
### Android
|
|
||||||
|
|
||||||
#### Build on Android device using Termux
|
|
||||||
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
|
|
||||||
|
|
||||||
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
|
|
||||||
|
|
||||||
Install tools in Termux:
|
|
||||||
```
|
|
||||||
apt update && apt upgrade -y
|
|
||||||
apt install git make cmake
|
|
||||||
```
|
|
||||||
|
|
||||||
It's recommended to move your model inside the `~/` directory for best performance:
|
|
||||||
```
|
|
||||||
cd storage/downloads
|
|
||||||
mv model.gguf ~/
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Building the Project using Android NDK
|
|
||||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
|
||||||
|
|
||||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
mkdir build-android
|
|
||||||
cd build-android
|
|
||||||
export NDK=/your_ndk_path
|
|
||||||
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
|
||||||
make
|
|
||||||
```
|
|
||||||
|
|
||||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
|
||||||
|
|
||||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
|
||||||
|
|
||||||
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
|
|
||||||
```
|
|
||||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
|
||||||
$cd /data/data/com.termux/files/home/bin
|
|
||||||
$chmod +x ./*
|
|
||||||
```
|
|
||||||
|
|
||||||
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
|
|
||||||
|
|
||||||
```
|
|
||||||
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
|
||||||
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
|
|
||||||
```
|
|
||||||
|
|
||||||
Now, you can start chatting:
|
|
||||||
```
|
|
||||||
$cd /data/data/com.termux/files/home/bin
|
|
||||||
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -4,13 +4,25 @@
|
||||||
|
|
||||||
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
|
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
|
||||||
|
|
||||||
|
|
||||||
|
### Build llama.cpp
|
||||||
|
Readme modification time: 20250206
|
||||||
|
|
||||||
|
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone git@github.com:OpenBMB/llama.cpp.git
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
git checkout minicpmv-main
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Build llama.cpp using `CMake`:
|
||||||
|
```bash
|
||||||
|
cmake -B build
|
||||||
|
cmake --build build --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
### Usage of MiniCPM-V 2.6
|
### Usage of MiniCPM-V 2.6
|
||||||
|
|
||||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
||||||
|
@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||||
```
|
```
|
||||||
|
|
||||||
Build for Linux or Mac
|
|
||||||
|
|
||||||
```bash
|
|
||||||
make
|
|
||||||
make llama-minicpmv-cli
|
|
||||||
```
|
|
||||||
|
|
||||||
Inference on Linux or Mac
|
Inference on Linux or Mac
|
||||||
```
|
```bash
|
||||||
# run f16 version
|
# run f16 version
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||||
|
|
||||||
# run quantized int4 version
|
# run quantized int4 version
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||||
|
|
||||||
# or run in interactive mode
|
|
||||||
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
|
||||||
```
|
|
||||||
|
|
||||||
### Video
|
|
||||||
Install FFmpeg
|
|
||||||
```
|
|
||||||
brew install ffmpeg
|
|
||||||
brew install pkg-config
|
|
||||||
```
|
|
||||||
|
|
||||||
### Android
|
|
||||||
|
|
||||||
#### Build on Android device using Termux
|
|
||||||
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
|
|
||||||
|
|
||||||
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
|
|
||||||
|
|
||||||
Install tools in Termux:
|
|
||||||
```
|
|
||||||
apt update && apt upgrade -y
|
|
||||||
apt install git make cmake
|
|
||||||
```
|
|
||||||
|
|
||||||
It's recommended to move your model inside the `~/` directory for best performance:
|
|
||||||
```
|
|
||||||
cd storage/downloads
|
|
||||||
mv model.gguf ~/
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Building the Project using Android NDK
|
|
||||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
|
||||||
|
|
||||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
mkdir build-android
|
|
||||||
cd build-android
|
|
||||||
export NDK=/your_ndk_path
|
|
||||||
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
|
||||||
make
|
|
||||||
```
|
|
||||||
|
|
||||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
|
||||||
|
|
||||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
|
||||||
|
|
||||||
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
|
|
||||||
```
|
|
||||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
|
||||||
$cd /data/data/com.termux/files/home/bin
|
|
||||||
$chmod +x ./*
|
|
||||||
```
|
|
||||||
|
|
||||||
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
|
|
||||||
|
|
||||||
```
|
|
||||||
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
|
||||||
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
|
|
||||||
```
|
|
||||||
|
|
||||||
Now, you can start chatting:
|
|
||||||
```
|
|
||||||
$cd /data/data/com.termux/files/home/bin
|
|
||||||
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml-cpp.h"
|
||||||
#include "ggml-cpu.h"
|
#include "ggml-cpu.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
@ -12,19 +13,9 @@
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
#include "ggml-sycl.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
#include "ggml-metal.h"
|
#include "ggml-metal.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
|
||||||
#include "ggml-cann.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
#include "ggml-vulkan.h"
|
#include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -158,6 +149,8 @@ static std::string format(const char * fmt, ...) {
|
||||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||||
|
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||||
|
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||||
|
|
||||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||||
#define TN_MINICPMV_QUERY "resampler.query"
|
#define TN_MINICPMV_QUERY "resampler.query"
|
||||||
|
@@ -184,6 +177,7 @@ enum projector_type {
     PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_GLM_EDGE,
     PROJECTOR_TYPE_MERGER,
+    PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -194,6 +188,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
     { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
     { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
+    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
 };
@@ -320,7 +315,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
             return kv.first;
         }
     }
-    return PROJECTOR_TYPE_UNKNOWN;
+    throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
 }
 
 #ifdef CLIP_DEBUG_FUNCTIONS
@@ -577,8 +572,18 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_ln_kv_b;
     struct ggml_tensor * mm_model_ln_post_w;
     struct ggml_tensor * mm_model_ln_post_b;
+
+    // gemma3
+    struct ggml_tensor * mm_input_proj_w;
+    struct ggml_tensor * mm_soft_emb_norm_w;
 };
 
+bool enable_gpu_clip = true;
+void set_clip_uses_gpu(bool usegpu)
+{
+    enable_gpu_clip = usegpu;
+}
+
 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
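The new global `enable_gpu_clip` plus `set_clip_uses_gpu()` lets the host application keep the CLIP/vision encoder on the CPU even when the rest of the model is offloaded. A minimal caller-side sketch; the assumption (not shown in this hunk) is that the toggle is flipped before the clip context is constructed, since the constructor below reads the flag:

```cpp
// Sketch: opt the CLIP encoder out of GPU offload before loading the projector.
set_clip_uses_gpu(false); // must happen before clip_model_load()/clip_init()
struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);
```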
@@ -591,7 +596,7 @@ struct clip_ctx {
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
 
-    int32_t max_feature_layer;
+    int32_t max_feature_layer; // unused in newer models like gemma3
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
@@ -603,21 +608,213 @@ struct clip_ctx {
     bool has_post_norm  = false;
     bool has_patch_bias = false;
 
-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;
 
     std::vector<uint8_t> buf_compute_meta;
 
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    ggml_backend_t backend = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_t backend = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
 
-    struct clip_image_size * load_image_size;
+    ggml_backend_sched_ptr sched;
+
+    struct clip_image_size * load_image_size = nullptr;
+
+    clip_ctx(clip_context_params & ctx_params) {
+        if(enable_gpu_clip)
+        {
+#ifdef GGML_USE_CUDA
+            backend = ggml_backend_cuda_init(0);
+            LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+#ifdef GGML_USE_METAL
+            backend = ggml_backend_metal_init();
+            LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+#ifdef GGML_USE_VULKAN
+            backend = ggml_backend_vk_init(0);
+            LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+        }
+
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+            LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        }
+
+        backend_ptrs.push_back(backend);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+
+        sched.reset(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+        );
+    }
+
+    ~clip_ctx() {
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        ggml_backend_free(backend);
+    }
 };
 
||||||
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+    const auto & model = ctx->vision_model;
+    const auto & hparams = model.hparams;
+
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+
+    const int patch_size  = hparams.patch_size;
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int hidden_size = hparams.hidden_size;
+    const int n_head      = hparams.n_head;
+    const int d_head      = hidden_size / n_head;
+    const int n_layer     = hparams.n_layer;
+    const float eps       = hparams.eps;
+
+    GGML_ASSERT(imgs->size == 1); // batch_size == 1
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
+        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // input raw
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
+    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+    inp = ggml_add(ctx0, inp, model.patch_bias);
+
+    // position embeddings
+    struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
+
+        // layernorm1
+        {
+            cur = ggml_norm(ctx0, cur, eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
+        }
+
+        // self-attention
+        {
+            struct ggml_tensor * Q =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+
+            Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
+            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+
+            struct ggml_tensor * K =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+
+            K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
+            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+
+            struct ggml_tensor * V =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+
+            V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
+            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
+            KQ = ggml_soft_max_inplace(ctx0, KQ);
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+            KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
+            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+            cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
+        }
+
+        // attention output
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, embeddings);
+
+        embeddings = cur; // embeddings = residual, cur = hidden_states
+
+        // layernorm2
+        {
+            cur = ggml_norm(ctx0, cur, eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+        }
+
+        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+        // siglip uses gelu
+        cur = ggml_gelu(ctx0, cur);
+
+        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+
+        // residual 2
+        cur = ggml_add(ctx0, embeddings, cur);
+
+        embeddings = cur;
+    }
+
+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
+    if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        const int batch_size = 1;
+        const int mm_tokens_per_image = 256; // default value for gemma3
+        const int tokens_per_side = sqrt(mm_tokens_per_image);
+        const int patches_per_image = sqrt(num_patches);
+        const int kernel_size = patches_per_image / tokens_per_side;
+
+        embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
+        embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
+
+        // doing a pool2d to reduce the number of output tokens to 256
+        embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
+        embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
+
+        // apply norm before projection
+        embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+        embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
+
+        // apply projection
+        embeddings = ggml_mul_mat(ctx0,
+            ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+            embeddings);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
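The PROJECTOR_TYPE_GEMMA3 branch above shrinks the SigLIP patch grid to a fixed 256 soft tokens with an average pool before the final projection. Below is a worked sketch of the arithmetic; the `image_size` and `patch_size` values are illustrative assumptions, only `mm_tokens_per_image = 256` is fixed in the code above:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // assumed example values (not taken from this diff)
    const int image_size          = 896;
    const int patch_size          = 14;
    const int num_patches         = (image_size / patch_size) * (image_size / patch_size); // 64*64 = 4096
    const int mm_tokens_per_image = 256;                                   // fixed default for gemma3
    const int tokens_per_side     = (int) std::sqrt(mm_tokens_per_image);  // 16
    const int patches_per_image   = (int) std::sqrt(num_patches);          // 64
    const int kernel_size         = patches_per_image / tokens_per_side;   // 4 -> 4x4 average pool
    std::printf("patches: %d, pool kernel: %dx%d, output tokens: %d\n",
                num_patches, kernel_size, kernel_size,
                (patches_per_image / kernel_size) * (patches_per_image / kernel_size)); // 256
    return 0;
}
```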
@@ -1163,7 +1360,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         } else {
             GGML_ABORT("fatel error");
         }
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
 
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
@@ -1185,14 +1383,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     return gf;
 }
 
-bool enable_gpu_clip = true;
-void set_clip_uses_gpu(bool usegpu)
-{
-    enable_gpu_clip = usegpu;
-}
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+    if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return clip_image_build_graph_siglip(ctx, imgs);
+    } else {
+        // TODO: we should have one build_* function per model
+        return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
+    }
+}
 
 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
|
@@ -1206,6 +1415,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     if (verbosity >= 1) {
+        try {
         const int n_tensors = gguf_get_n_tensors(ctx);
         const int n_kv = gguf_get_n_kv(ctx);
         const int ftype = get_u32(ctx, KEY_FTYPE);
|
@@ -1224,6 +1434,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
         LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
         LOG_INF("\n");
+        } catch (std::runtime_error & /*e*/) {
+            LOG_INF("Could not list CLIP model properties.\n");
+        }
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
|
@@ -1286,7 +1499,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-    clip_ctx * new_clip = new clip_ctx{};
+    clip_ctx * new_clip = new clip_ctx(ctx_params);
 
     // update projector type
     {
|
@@ -1305,39 +1518,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-if(enable_gpu_clip)
-{
-#ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-#endif
-}
-
-if (!new_clip->backend) {
-    new_clip->backend = ggml_backend_cpu_init();
-    LOG_INF("%s: CLIP using CPU backend\n", __func__);
-}
-
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
|
@@ -1375,8 +1555,12 @@ if(enable_gpu_clip)
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);
 
-        idx = get_key_idx(ctx, KEY_USE_GELU);
-        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+        try {
+            idx = get_key_idx(ctx, KEY_USE_GELU);
+            new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+        } catch (std::runtime_error & /*e*/) {
+            new_clip->use_gelu = false;
+        }
 
         try {
             idx = get_key_idx(ctx, KEY_USE_SILU);
|
@@ -1390,6 +1574,7 @@ if(enable_gpu_clip)
         LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
         LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
         LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+        LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
         LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
         LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
@@ -1432,7 +1617,9 @@ if(enable_gpu_clip)
     }
 
     // alloc memory and offload data
-    new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
@@ -1445,7 +1632,7 @@ if(enable_gpu_clip)
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+        if (ggml_backend_buft_is_host(buft)) {
             // for the CPU and Metal backend, we can read directly into the tensor
             fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
         } else {
|
@@ -1581,11 +1768,17 @@ if(enable_gpu_clip)
     }
 
     try {
         vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+    } catch(const std::exception& /*e*/) {
+        vision_model.patch_embeddings_0 = nullptr;
+    }
+
+    try {
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
     } catch(const std::exception& /*e*/) {
-        LOG_ERR("%s: failed to load vision model tensors\n", __func__);
+        vision_model.position_embeddings = nullptr;
     }
 
     try {
         vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
     } catch(const std::exception& /*e*/) {
@@ -1696,6 +1889,10 @@ if(enable_gpu_clip)
         vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
         vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
     }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
+        vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
+    }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1731,14 +1928,21 @@ if(enable_gpu_clip)
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = new_clip->backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }
 
     return new_clip;
||||||
|
@@ -2317,7 +2521,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         return true;
     }
 
-    if (ctx->has_glm_projector) {
+    if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
         res_imgs->size = 1;
         res_imgs->data = new clip_image_f32[res_imgs->size];
         clip_image_u8 resized_image;
|
@@ -2506,12 +2710,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
|
@@ -2707,8 +2905,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2847,6 +3046,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        // do nothing
+    }
     else {
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
@@ -2877,7 +3079,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -3067,6 +3273,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         return ctx->vision_model.mm_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        return ctx->vision_model.mm_input_proj_w->ne[0];
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
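For orientation, the caller-side sequence that drives this encode path (it is the same sequence the new gemma3 CLI later in this patch uses) is: load the raw image, preprocess it into an f32 batch, then run the batch through the scheduler-backed graph. A condensed sketch; `ctx_clip`, the file name and the thread count are placeholders, and error handling is trimmed:

```cpp
// Sketch: encode one image into projector embeddings with the clip API touched above.
clip_image_u8 * img_u8 = clip_image_u8_init();
if (!clip_image_load_from_file("input.jpg", img_u8)) { /* handle error */ }

clip_image_f32_batch batch_f32;
if (!clip_image_preprocess(ctx_clip, img_u8, &batch_f32)) { /* handle error */ }

std::vector<float> embd(256 * clip_n_mmproj_embd(ctx_clip)); // gemma3: 256 soft tokens per image
clip_image_batch_encode(ctx_clip, /*n_threads=*/4, &batch_f32, embd.data());

clip_image_f32_batch_free(&batch_f32);
clip_image_u8_free(img_u8);
```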
|
|
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+struct clip_context_params {
+    bool use_gpu;
+    int verbosity;
+};
+
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
+CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
||||||
|
|
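The header hunk above keeps clip_model_load only as a deprecated wrapper and routes everything through clip_init with an explicit clip_context_params. A minimal sketch of the new entry point (the file name and verbosity are placeholder values):

```cpp
// Sketch: open a multimodal projector with the new clip_init() API.
clip_context_params cparams;
cparams.use_gpu   = true;   // false keeps the vision encoder on the CPU
cparams.verbosity = 1;

clip_ctx * ctx_clip = clip_init("mmproj-model-f16.gguf", cparams);
// ... encode images ...
clip_free(ctx_clip);
```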
examples/llava/gemma3-cli.cpp (new file, 341 lines)

@@ -0,0 +1,341 @@
#include "arg.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "clip.h"
|
||||||
|
#include "stb_image.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "console.h"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
|
#include <signal.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#elif defined (_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static bool g_is_generating = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Please note that this is NOT a production-ready stuff.
|
||||||
|
* It is a playground for trying Gemma 3 vision capabilities.
|
||||||
|
* For contributors: please keep this code simple and easy to understand.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||||
|
LOG(
|
||||||
|
"Experimental CLI for using Gemma 3 vision model\n\n"
|
||||||
|
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||||
|
" -m and --mmproj are required\n"
|
||||||
|
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||||
|
argv[0]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
|
static void sigint_handler(int signo) {
|
||||||
|
if (signo == SIGINT) {
|
||||||
|
if (g_is_generating) {
|
||||||
|
g_is_generating = false;
|
||||||
|
} else {
|
||||||
|
console::cleanup();
|
||||||
|
LOG("\nInterrupted by user\n");
|
||||||
|
_exit(130);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct gemma3_context {
|
||||||
|
struct clip_ctx * ctx_clip = NULL;
|
||||||
|
common_init_result llama_init;
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * lctx;
|
||||||
|
const llama_vocab * vocab;
|
||||||
|
llama_batch batch;
|
||||||
|
|
||||||
|
int n_threads = 1;
|
||||||
|
llama_pos n_past = 0;
|
||||||
|
|
||||||
|
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||||
|
model = llama_init.model.get();
|
||||||
|
lctx = llama_init.context.get();
|
||||||
|
vocab = llama_model_get_vocab(model);
|
||||||
|
n_threads = params.cpuparams.n_threads;
|
||||||
|
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||||
|
init_clip_model(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_clip_model(common_params & params) {
|
||||||
|
const char * clip_path = params.mmproj.c_str();
|
||||||
|
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
~gemma3_context() {
|
||||||
|
clip_free(ctx_clip);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct decode_embd_batch {
|
||||||
|
std::vector<llama_pos> pos;
|
||||||
|
std::vector<int32_t> n_seq_id;
|
||||||
|
std::vector<llama_seq_id> seq_id_0;
|
||||||
|
std::vector<llama_seq_id *> seq_ids;
|
||||||
|
std::vector<int8_t> logits;
|
||||||
|
llama_batch batch;
|
||||||
|
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||||
|
pos .resize(n_tokens);
|
||||||
|
n_seq_id.resize(n_tokens);
|
||||||
|
seq_ids .resize(n_tokens + 1);
|
||||||
|
logits .resize(n_tokens);
|
||||||
|
seq_id_0.resize(1);
|
||||||
|
seq_id_0[0] = seq_id;
|
||||||
|
seq_ids [n_tokens] = nullptr;
|
||||||
|
batch = {
|
||||||
|
/*n_tokens =*/ n_tokens,
|
||||||
|
/*tokens =*/ nullptr,
|
||||||
|
/*embd =*/ embd,
|
||||||
|
/*pos =*/ pos.data(),
|
||||||
|
/*n_seq_id =*/ n_seq_id.data(),
|
||||||
|
/*seq_id =*/ seq_ids.data(),
|
||||||
|
/*logits =*/ logits.data(),
|
||||||
|
};
|
||||||
|
for (int i = 0; i < n_tokens; i++) {
|
||||||
|
batch.pos [i] = pos_0 + i;
|
||||||
|
batch.n_seq_id[i] = 1;
|
||||||
|
batch.seq_id [i] = seq_id_0.data();
|
||||||
|
batch.logits [i] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
|
||||||
|
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
|
||||||
|
common_batch_clear(ctx.batch);
|
||||||
|
for (llama_token & t : tokens) {
|
||||||
|
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
|
||||||
|
}
|
||||||
|
if (logits_last) {
|
||||||
|
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
|
||||||
|
}
|
||||||
|
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
|
||||||
|
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||||
|
LOG_ERR("Failed to decode text\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int eval_image(gemma3_context & ctx, std::string & fname) {
|
||||||
|
std::vector<float> image_embd_v;
|
||||||
|
int n_embd = llama_model_n_embd(ctx.model);
|
||||||
|
int n_tokens = 256;
|
||||||
|
image_embd_v.resize(n_tokens * n_embd);
|
||||||
|
|
||||||
|
bool ok;
|
||||||
|
struct clip_image_u8 * img_u8 = clip_image_u8_init();
|
||||||
|
ok = clip_image_load_from_file(fname.c_str(), img_u8);
|
||||||
|
if (!ok) {
|
||||||
|
LOG_ERR("Unable to load image %s\n", fname.c_str());
|
||||||
|
clip_image_u8_free(img_u8);
|
||||||
|
return 2; // non-fatal error
|
||||||
|
}
|
||||||
|
|
||||||
|
clip_image_f32_batch batch_f32;
|
||||||
|
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
|
||||||
|
if (!ok) {
|
||||||
|
LOG_ERR("Unable to preprocess image\n");
|
||||||
|
clip_image_f32_batch_free(&batch_f32);
|
||||||
|
clip_image_u8_free(img_u8);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t t0 = ggml_time_ms();
|
||||||
|
LOG("Encoding image %s\n", fname.c_str());
|
||||||
|
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
|
||||||
|
if (!ok) {
|
||||||
|
LOG_ERR("Unable to encode image\n");
|
||||||
|
clip_image_f32_batch_free(&batch_f32);
|
||||||
|
clip_image_u8_free(img_u8);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||||
|
|
||||||
|
clip_image_f32_batch_free(&batch_f32);
|
||||||
|
clip_image_u8_free(img_u8);
|
||||||
|
|
||||||
|
// decode image embeddings
|
||||||
|
int64_t t1 = ggml_time_ms();
|
||||||
|
eval_text(ctx, "<start_of_image>");
|
||||||
|
llama_set_causal_attn(ctx.lctx, false);
|
||||||
|
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
|
||||||
|
if (llama_decode(ctx.lctx, batch_img.batch)) {
|
||||||
|
LOG_ERR("failed to decode image\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
ctx.n_past += n_tokens;
|
||||||
|
llama_set_causal_attn(ctx.lctx, true);
|
||||||
|
eval_text(ctx, "<end_of_image>");
|
||||||
|
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
|
||||||
|
for (int i = 0; i < n_predict; i++) {
|
||||||
|
if (i > n_predict || !g_is_generating) {
|
||||||
|
printf("\n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
|
||||||
|
common_sampler_accept(smpl, token_id, true);
|
||||||
|
|
||||||
|
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
|
||||||
|
printf("\n");
|
||||||
|
break; // end of generation
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
// eval the token
|
||||||
|
common_batch_clear(ctx.batch);
|
||||||
|
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
|
||||||
|
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||||
|
LOG_ERR("failed to decode token\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
ggml_time_init();
|
||||||
|
|
||||||
|
common_params params;
|
||||||
|
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||||
|
|
||||||
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_init();
|
||||||
|
|
||||||
|
if (params.mmproj.empty()) {
|
||||||
|
show_additional_info(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
gemma3_context ctx(params);
|
||||||
|
printf("%s: %s\n", __func__, params.model.c_str());
|
||||||
|
|
||||||
|
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||||
|
|
||||||
|
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
|
||||||
|
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||||
|
|
||||||
|
// ctrl+C handling
|
||||||
|
{
|
||||||
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
|
struct sigaction sigint_action;
|
||||||
|
sigint_action.sa_handler = sigint_handler;
|
||||||
|
sigemptyset (&sigint_action.sa_mask);
|
||||||
|
sigint_action.sa_flags = 0;
|
||||||
|
sigaction(SIGINT, &sigint_action, NULL);
|
||||||
|
#elif defined (_WIN32)
|
||||||
|
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||||
|
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||||
|
};
|
||||||
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (eval_text(ctx, "<bos>")) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_single_turn) {
|
||||||
|
g_is_generating = true;
|
||||||
|
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (auto & fname : params.image) {
|
||||||
|
if (eval_image(ctx, fname)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (generate_response(ctx, smpl, n_predict)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
LOG("\n Running in chat mode, available commands:");
|
||||||
|
LOG("\n /image <path> load an image");
|
||||||
|
LOG("\n /clear clear the chat history");
|
||||||
|
LOG("\n /quit or /exit exit the program");
|
||||||
|
LOG("\n");
|
||||||
|
|
||||||
|
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
g_is_generating = false;
|
||||||
|
LOG("\n> ");
|
||||||
|
console::set_display(console::user_input);
|
||||||
|
std::string line;
|
||||||
|
console::readline(line, false);
|
||||||
|
console::set_display(console::reset);
|
||||||
|
line = string_strip(line);
|
||||||
|
if (line.empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (line == "/quit" || line == "/exit") {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (line == "/clear") {
|
||||||
|
ctx.n_past = 0;
|
||||||
|
llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
|
||||||
|
LOG("Chat history cleared\n\n");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
g_is_generating = true;
|
||||||
|
if (line.find("/image") == 0) {
|
||||||
|
std::string image = line.substr(7);
|
||||||
|
int res = eval_image(ctx, image);
|
||||||
|
if (res == 2) {
|
||||||
|
continue; // image not found
|
||||||
|
}
|
||||||
|
if (res) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (generate_response(ctx, smpl, n_predict)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
examples/llava/gemma3_convert_encoder_to_gguf.py (new file, 307 lines)

@@ -0,0 +1,307 @@
|
||||||
|
import gguf
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import torch
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
from typing import cast, ContextManager, Any, Iterator
|
||||||
|
from pathlib import Path
|
||||||
|
from torch import Tensor
|
||||||
|
|
||||||
|
logger = logging.getLogger("gemma3-mmproj")
|
||||||
|
|
||||||
|
|
||||||
|
# (copied from convert_hf_to_gguf.py)
|
||||||
|
# tree of lazy tensors
|
||||||
|
class LazyTorchTensor(gguf.LazyBase):
|
||||||
|
_tensor_type = torch.Tensor
|
||||||
|
# to keep the type-checker happy
|
||||||
|
dtype: torch.dtype
|
||||||
|
shape: torch.Size
|
||||||
|
|
||||||
|
# only used when converting a torch.Tensor to a np.ndarray
|
||||||
|
_dtype_map: dict[torch.dtype, type] = {
|
||||||
|
torch.float16: np.float16,
|
||||||
|
torch.float32: np.float32,
|
||||||
|
}
|
||||||
|
|
||||||
|
# used for safetensors slices
|
||||||
|
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||||
|
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||||
|
_dtype_str_map: dict[str, torch.dtype] = {
|
||||||
|
"F64": torch.float64,
|
||||||
|
"F32": torch.float32,
|
||||||
|
"BF16": torch.bfloat16,
|
||||||
|
"F16": torch.float16,
|
||||||
|
# "U64": torch.uint64,
|
||||||
|
"I64": torch.int64,
|
||||||
|
# "U32": torch.uint32,
|
||||||
|
"I32": torch.int32,
|
||||||
|
# "U16": torch.uint16,
|
||||||
|
"I16": torch.int16,
|
||||||
|
"U8": torch.uint8,
|
||||||
|
"I8": torch.int8,
|
||||||
|
"BOOL": torch.bool,
|
||||||
|
"F8_E4M3": torch.float8_e4m3fn,
|
||||||
|
"F8_E5M2": torch.float8_e5m2,
|
||||||
|
}
|
||||||
|
|
||||||
|
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||||
|
dtype = self._dtype_map[self.dtype]
|
||||||
|
return gguf.LazyNumpyTensor(
|
||||||
|
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||||
|
args=(self,),
|
||||||
|
func=(lambda s: s.numpy())
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||||
|
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||||
|
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||||
|
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||||
|
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||||
|
return cast(torch.Tensor, lazy)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||||
|
del types # unused
|
||||||
|
|
||||||
|
if kwargs is None:
|
||||||
|
kwargs = {}
|
||||||
|
|
||||||
|
if func is torch.Tensor.numpy:
|
||||||
|
return args[0].numpy()
|
||||||
|
|
||||||
|
return cls._wrap_fn(func)(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class Gemma3VisionTower:
|
||||||
|
hparams: dict
|
||||||
|
gguf_writer: gguf.GGUFWriter
|
||||||
|
fname_out: Path
|
||||||
|
ftype: gguf.LlamaFileType
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load_hparams(dir_model: Path):
|
||||||
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
||||||
|
part_names: list[str] = []
|
||||||
|
for filename in os.listdir(dir_model):
|
||||||
|
if filename.startswith(prefix) and filename.endswith(suffix):
|
||||||
|
part_names.append(filename)
|
||||||
|
part_names.sort()
|
||||||
|
return part_names
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
dir_model: Path,
|
||||||
|
fname_out: Path,
|
||||||
|
ftype: gguf.LlamaFileType,
|
||||||
|
is_big_endian: bool,):
|
||||||
|
hparams = Gemma3VisionTower.load_hparams(dir_model)
|
||||||
|
self.hparams = hparams
|
||||||
|
self.fname_out = fname_out
|
||||||
|
self.ftype = ftype
|
||||||
|
endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||||
|
self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)
|
||||||
|
|
||||||
|
text_config = hparams["text_config"]
|
||||||
|
vision_config = hparams["vision_config"]
|
||||||
|
|
||||||
|
assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
|
||||||
|
assert text_config is not None
|
||||||
|
assert vision_config is not None
|
||||||
|
|
||||||
|
self.gguf_writer.add_string ("clip.projector_type", "gemma3")
|
||||||
|
self.gguf_writer.add_bool ("clip.has_text_encoder", False)
|
||||||
|
self.gguf_writer.add_bool ("clip.has_vision_encoder", True)
|
||||||
|
self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
|
||||||
|
self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
|
||||||
|
# default values taken from HF tranformers code
|
||||||
|
self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5])
|
||||||
|
self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5])
|
||||||
|
self.gguf_writer.add_bool ("clip.use_gelu", True)
|
||||||
|
|
||||||
|
# load tensors
|
||||||
|
for name, data_torch in self.get_tensors(dir_model):
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
self.add_tensor(name, data_torch)
|
||||||
|
|
||||||
|
def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
|
||||||
|
part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
|
||||||
|
tensor_names_from_parts: set[str] = set()
|
||||||
|
for part_name in part_names:
|
||||||
|
logger.info(f"gguf: loading model part '{part_name}'")
|
||||||
|
from safetensors import safe_open
|
||||||
|
ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
|
||||||
|
with ctx as model_part:
|
||||||
|
tensor_names_from_parts.update(model_part.keys())
|
||||||
|
|
||||||
|
for name in model_part.keys():
|
||||||
|
data = model_part.get_slice(name)
|
||||||
|
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||||
|
yield name, data
|
||||||
|
|
||||||
|
def add_tensor(self, name: str, data_torch: Tensor):
|
||||||
|
is_1d = len(data_torch.shape) == 1
|
||||||
|
is_embd = ".embeddings." in name
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
can_quantize = not is_1d and not is_embd
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
|
# this is to support old checkpoint
|
||||||
|
# TODO: remove this when we have the final model
|
||||||
|
name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
|
||||||
|
name = name.replace("multimodal_projector.", "multi_modal_projector.")
|
||||||
|
|
||||||
|
# filter only vision tensors
|
||||||
|
if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
|
||||||
|
return
|
||||||
|
# prefix
|
||||||
|
name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
|
||||||
|
name = name.replace("vision_tower.vision_model.", "v.")
|
||||||
|
# projector and input embd
|
||||||
|
name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
|
||||||
|
name = name.replace(".embeddings.position_embedding.", ".position_embd.")
|
||||||
|
name = name.replace(
|
||||||
|
"multi_modal_projector.mm_input_projection_weight",
|
||||||
|
"mm.input_projection.weight"
|
||||||
|
)
|
||||||
|
name = name.replace(
|
||||||
|
"multi_modal_projector.mm_soft_emb_norm.weight",
|
||||||
|
"mm.soft_emb_norm.weight"
|
||||||
|
)
|
||||||
|
name = name.replace("post_layernorm.", "post_ln.")
|
||||||
|
# each block
|
||||||
|
name = name.replace(".self_attn.k_proj.", ".attn_k.")
|
||||||
|
name = name.replace(".self_attn.v_proj.", ".attn_v.")
|
||||||
|
name = name.replace(".self_attn.q_proj.", ".attn_q.")
|
||||||
|
name = name.replace(".self_attn.out_proj.", ".attn_out.")
|
||||||
|
name = name.replace(".layer_norm1.", ".ln1.")
|
||||||
|
name = name.replace(".layer_norm2.", ".ln2.")
|
||||||
|
name = name.replace(".mlp.fc1.", ".ffn_down.")
|
||||||
|
name = name.replace(".mlp.fc2.", ".ffn_up.")
|
||||||
|
|
||||||
|
if can_quantize:
|
||||||
|
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported file type: {self.ftype}")
|
||||||
|
|
||||||
|
# corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
|
||||||
|
# the other norm values are part of SigLIP model, and they are already correct
|
||||||
|
# ref code: Gemma3RMSNorm
|
||||||
|
if "soft_emb_norm.weight" in name:
|
||||||
|
logger.info(f"Correcting norm value for '{name}'")
|
||||||
|
data_torch = data_torch + 1
|
||||||
|
|
||||||
|
data = data_torch.numpy()
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = gguf.quants.quantize(data, data_qtype)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
data = gguf.quants.quantize(data, data_qtype)
|
||||||
|
|
||||||
|
# reverse shape to make it similar to the internal ggml dimension order
|
||||||
|
shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
|
||||||
|
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||||
|
|
||||||
|
def write(self):
|
||||||
|
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
||||||
|
self.gguf_writer.write_kv_data_to_file()
|
||||||
|
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||||
|
self.gguf_writer.close()
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Convert Gemma 3 vision tower safetensors to GGUF format",)
|
||||||
|
parser.add_argument(
|
||||||
|
"--outfile", type=Path, default="mmproj.gguf",
|
||||||
|
help="path to write to",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
|
||||||
|
help="output format",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--bigendian", action="store_true",
|
||||||
|
help="model is executed on big endian machine",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"model", type=Path,
|
||||||
|
help="directory containing model file",
|
||||||
|
nargs="?",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--verbose", action="store_true",
|
||||||
|
help="increase output verbosity",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
if args.model is None:
|
||||||
|
parser.error("the following arguments are required: model")
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
if args.verbose:
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
else:
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
dir_model = args.model
|
||||||
|
|
||||||
|
if not dir_model.is_dir():
|
||||||
|
logger.error(f'Error: {args.model} is not a directory')
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||||
|
"f32": gguf.LlamaFileType.ALL_F32,
|
||||||
|
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||||
|
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||||
|
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"Loading model: {dir_model.name}")
|
||||||
|
|
||||||
|
with torch.inference_mode():
|
||||||
|
gemma3_vision_tower = Gemma3VisionTower(
|
||||||
|
dir_model=dir_model,
|
||||||
|
fname_out=args.outfile,
|
||||||
|
ftype=ftype_map[args.outtype],
|
||||||
|
is_big_endian=args.bigendian,
|
||||||
|
)
|
||||||
|
gemma3_vision_tower.write()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
|
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    struct clip_context_params clip_params = {
+        /* use_gpu */   params->n_gpu_layers != 0,
+        /* verbosity */ params->verbosity,
+    };
+    auto * ctx_clip = clip_init(clip_path, clip_params);
     return ctx_clip;
 }
 
@@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
     eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
     if (num_image_embeds > 1) {
-        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
-        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
-            for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
-                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
-                if (j == num_image_embeds_col - 1) {
-                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
-                }
-            }
-        }
-        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+        if (has_minicpmv_projector == 2) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
+                }
+            }
+            eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+        }
+        else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
+                }
+            }
+        }
     }
     LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }
|
@@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_minicpmv_projector = True
-    minicpmv_version = 4
 elif args.vision_only:
     fname_middle = "vision-"
     has_text_encoder = False
@@ -384,8 +384,9 @@ struct server_task {
                     SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
                     common_grammar_trigger trigger;
                     trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
-                    trigger.value = (llama_token) token;
-                    params.sampling.grammar_triggers.push_back(trigger);
+                    trigger.value = word;
+                    trigger.token = token;
+                    params.sampling.grammar_triggers.push_back(std::move(trigger));
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                     params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
@@ -750,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                    {"name", tc.name},
                    {"arguments", tc.arguments},
                }},
-                {"id", tc.id},
+                // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                // We only generate a random id for the ones that don't generate one by themselves
+                // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
            });
        }
        message["tool_calls"] = tool_calls;
@@ -92,6 +92,7 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
     assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
     tool_call = tool_calls[0]
     assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
+    assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
     expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
     assert expected_function_name == tool_call["function"]["name"]
     actual_arguments = tool_call["function"]["arguments"]
@@ -373,6 +374,7 @@ def do_test_weather(server: ServerProcess, **kwargs):
     tool_call = tool_calls[0]
     # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
     assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
+    assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
     actual_arguments = json.loads(tool_call["function"]["arguments"])
     assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
     location = actual_arguments["location"]
@@ -596,6 +598,7 @@ def do_test_hello_world(server: ServerProcess, **kwargs):
     tool_call = tool_calls[0]
     # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
     assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
+    assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
     actual_arguments = json.loads(tool_call["function"]["arguments"])
     assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
     code = actual_arguments["code"]
@@ -435,6 +435,10 @@ static std::string gen_chatcmplid() {
     return "chatcmpl-" + random_string();
 }
 
+static std::string gen_tool_call_id() {
+    return random_string();
+}
+
 //
 // other common utils
 //
@@ -498,7 +498,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         search_paths.push_back(get_executable_path());
         search_paths.push_back(fs::current_path());
     } else {
-        search_paths.push_back(user_search_path);
+        search_paths.push_back(fs::u8path(user_search_path));
     }
 
     int best_score = 0;
@@ -512,9 +512,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
             fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
             for (const auto & entry : dir_it) {
                 if (entry.is_regular_file()) {
-                    auto filename = entry.path().filename().native();
-                    auto ext = entry.path().extension().native();
-                    if (filename.find(file_prefix) == 0 && ext == file_extension) {
+                    auto filename = entry.path().filename();
+                    auto ext = entry.path().extension();
+                    if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                         dl_handle_ptr handle { dl_load_library(entry) };
                         if (!handle && !silent) {
                             GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
@@ -545,7 +545,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     // try to load the base backend
     for (const auto & search_path : search_paths) {
         fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
-        fs::path path = search_path.native() + filename.native();
+        fs::path path = search_path / filename;
         if (fs::exists(path)) {
             return get_reg().load_backend(path, silent);
         }
@@ -1461,7 +1461,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
+    // GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
 
     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
@@ -395,11 +395,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
+#elif defined(RDNA1) || defined(__gfx900__)
     int tmp1;
     int tmp2;
     asm("\n \
@@ -52,12 +52,11 @@ typedef half (*vec_dot_KQ_f16_t)(
 typedef float (*vec_dot_KQ_f32_t)(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
 
-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
     const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_v);
 
     T sum = 0.0f;
@@ -93,12 +92,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     return sum;
 }
 
-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
     const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_v);
 
     T sum = 0.0f;
@@ -138,12 +136,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     return sum;
 }
 
-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
     const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_v);
 
     T sum = 0.0f;
@@ -186,12 +183,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     return sum;
 }
 
-template<typename T, int D>
+template<typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
     const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_v);
 
     T sum = 0.0f;
@@ -238,12 +234,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     return sum;
 }
 
-template <typename T, int D>
+template <typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
 
     const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_v);
 
     T sum = 0.0f;
@@ -272,12 +267,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     return sum;
 }
 
-template <typename T, int D>
+template <typename T, int D, int warp_size>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
 
     const half2 * K_h2 = (const half2 *) K_c;
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     GGML_UNUSED(Q_q8);
     GGML_UNUSED(Q_ds_v);
 
@@ -480,25 +474,25 @@ static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ v
     return x[i];
 }
 
-template <int D>
+template <int D, int warp_size = WARP_SIZE>
 constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> :
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
         nullptr;
 }
 
-template <int D>
+template <int D, int warp_size = WARP_SIZE>
 constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> :
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
         nullptr;
 }
 
@@ -681,7 +675,8 @@ static void on_no_fattn_vec_case(const int D) {
 template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
 void launch_fattn(
     ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
-    const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V
+    const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V,
+    const int warp_size = WARP_SIZE
 ) {
     constexpr int ncols = ncols1 * ncols2;
 
@@ -704,8 +699,6 @@ void launch_fattn(
 
     GGML_ASSERT(Q->ne[3] == 1);
 
-    const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
-
     ggml_cuda_pool & pool = ctx.pool();
     cudaStream_t main_stream = ctx.stream();
     const int id = ggml_cuda_get_device();
@@ -805,7 +798,6 @@ void launch_fattn(
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     GGML_ASSERT(block_dim.x % warp_size == 0);
-    GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
     fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
         (const char *) Q->data,
         K_data,
@@ -469,6 +469,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
     constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
     const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
     const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
 
     float logit_softcap;
     memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
@@ -485,7 +486,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
             fattn_kernel = flash_attn_ext_f16<
                 D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
         }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
         return;
     }
     if (2*blocks_num_pb1 < 2*nsm) {
@@ -500,7 +501,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
             fattn_kernel = flash_attn_ext_f16<
                 D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
         }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
         return;
     }
     constexpr int parallel_blocks = 1;
@@ -514,7 +515,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
         fattn_kernel = flash_attn_ext_f16<
             D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
     }
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
 }
 
 void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -47,11 +47,89 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
         1;
 }
 
+enum mmvq_parameter_table_id {
+    MMVQ_PARAMETERS_GENERIC = 0,
+    MMVQ_PARAMETERS_GCN,
+    MMVQ_PARAMETERS_RDNA2
+};
+
+static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
+#if defined(RDNA2) || defined(RDNA3)
+    return MMVQ_PARAMETERS_RDNA2;
+#elif defined(GCN) || defined(CDNA)
+    return MMVQ_PARAMETERS_GCN;
+#else
+    return MMVQ_PARAMETERS_GENERIC;
+#endif
+}
+
+static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
+    if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+        return MMVQ_PARAMETERS_RDNA2;
+    }
+    if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
+        return MMVQ_PARAMETERS_GCN;
+    }
+    return MMVQ_PARAMETERS_GENERIC;
+}
+
+static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
+    if (table_id == MMVQ_PARAMETERS_GENERIC) {
+        switch (ncols_y) {
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+                return 4;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                return 2;
+            default:
+                return 1;
+        }
+    } else if (table_id == MMVQ_PARAMETERS_GCN) {
+        switch (ncols_y) {
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+                return 2;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+            default:
+                return 1;
+        }
+    }
+    return 1;
+}
+
+static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
+    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
+        switch (ncols_y) {
+            case 1:
+                return 1;
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                return 2;
+            default:
+                return 1;
+        }
+    }
+    return 1;
+}
+
 template <ggml_type type, int ncols_y>
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 // tell the compiler to use as many registers as it wants, see nwarps definition below
-__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
@@ -59,24 +137,20 @@ static __global__ void mul_mat_vec_q(
     constexpr int qk  = ggml_cuda_type_traits<type>::qk;
     constexpr int qi  = ggml_cuda_type_traits<type>::qi;
     constexpr int vdr = get_vdr_mmvq(type);
+    constexpr mmvq_parameter_table_id table_id = get_device_table_id();
+    constexpr int nwarps = calc_nwarps(ncols_y, table_id);
+    constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
 
     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
-    constexpr int nwarps              = 1;
-    constexpr int rows_per_cuda_block = 1;
-#else
-    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
-    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-
-    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    const     int tid = warp_size*threadIdx.y + threadIdx.x;
     const     int row0 = rows_per_cuda_block*blockIdx.x;
     const     int blocks_per_row_x = ncols_x / qk;
     const     int blocks_per_col_y = nrows_y / QK8_1;
-    constexpr int blocks_per_iter  = vdr * nwarps*WARP_SIZE / qi;
+    constexpr int blocks_per_iter  = vdr * nwarps*warp_size / qi;
 
     // partial sum for each thread
     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
 
     const block_q8_1 * y = (const block_q8_1 *) vy;
@@ -96,7 +170,7 @@ static __global__ void mul_mat_vec_q(
         }
     }
 
-    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
+    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
     if (threadIdx.y > 0) {
 #pragma unroll
         for (int j = 0; j < ncols_y; ++j) {
@@ -120,7 +194,7 @@ static __global__ void mul_mat_vec_q(
             for (int l = 0; l < nwarps-1; ++l) {
                 tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
             }
-            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+            tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
         }
 
         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
@@ -129,6 +203,13 @@ static __global__ void mul_mat_vec_q(
     }
 }
 
+static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
+    const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
+    const dim3 block_nums(nblocks, 1, 1);
+    const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
+    return {block_nums, block_dims};
+}
+
 template <ggml_type type>
 static void mul_mat_vec_q_cuda(
     const void * vx, const void * vy, float * dst,
@@ -137,65 +218,67 @@ static void mul_mat_vec_q_cuda(
     GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
     GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
 
-    int id = ggml_cuda_get_device();
-    int64_t nwarps = 1;
-    int64_t rows_per_cuda_block = 1;
-
-    if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
-        switch(ncols_y) {
-            case 1:
-                nwarps = 4;
-                rows_per_cuda_block = 1;
-                break;
-            case 2:
-            case 3:
-            case 4:
-                nwarps = 4;
-                rows_per_cuda_block = 2;
-                break;
-            case 5:
-            case 6:
-            case 7:
-            case 8:
-                nwarps = 2;
-                rows_per_cuda_block = 2;
-                break;
-            default:
-                GGML_ABORT("fatal error");
-                break;
-        }
-    }
-
-    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
-    const dim3 block_nums(nblocks, 1, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
+    const int device = ggml_cuda_get_device();
+    const int warp_size = ggml_cuda_info().devices[device].warp_size;
+    const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
+
     switch (ncols_y) {
         case 1:
-            mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 1;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
             break;
+        }
         case 2:
-            mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 2;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 3:
-            mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 3;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 4:
-            mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 4;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 5:
-            mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 5;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 6:
-            mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 6;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 7:
-            mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 7;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        case 8:
-            mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+        {
+            constexpr int c_ncols_y = 8;
+            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
+            mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
+        }
        default:
            GGML_ABORT("fatal error");
            break;
@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 static struct ggml_backend_metal_device_context {
     id<MTLDevice> mtl_device;
     int           mtl_device_ref_count;
+    id<MTLLibrary> mtl_library;
 
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
     /*.mtl_device              =*/ nil,
     /*.mtl_device_ref_count    =*/ 0,
+    /*.mtl_library             =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm        =*/ false,
     /*.has_residency_sets      =*/ false,
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     ctx->mtl_device_ref_count--;
 
     if (ctx->mtl_device_ref_count == 0) {
+        if (ctx->mtl_library) {
+            [ctx->mtl_library release];
+            ctx->mtl_library = nil;
+        }
+
         if (ctx->mtl_device) {
             [ctx->mtl_device release];
             ctx->mtl_device = nil;
@@ -495,6 +502,139 @@ static void * ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
+// load library
+//
+// - first check if the library is embedded
+// - then check if the library is in the bundle
+// - if not found, load the source and compile it
+// - if that fails, return NULL
+static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
+    id<MTLLibrary> metal_library = nil;
+    NSError * error = nil;
+    NSString * src = nil;
+
+#if GGML_METAL_EMBED_LIBRARY
+    GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+    extern const char ggml_metallib_start[];
+    extern const char ggml_metallib_end[];
+
+    src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+
+#else
+
+#ifdef SWIFT_PACKAGE
+    NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
+#else
+    NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+#endif
+
+    NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+    if (path_lib == nil) {
+        // Try to find the resource in the directory where the current binary located.
+        NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
+        NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
+        NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
+        if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
+            GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
+            NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
+            if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
+                // Optionally, if this is a symlink, try to resolve it.
+                default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
+                if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
+                    // It is a relative path, adding the binary directory as directory prefix.
+                    default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
+                }
+                if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
+                    // Link to the resource could not be resolved.
+                    default_metallib_path = nil;
+                } else {
+                    GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
+                }
+            }
+        } else {
+            // The resource couldn't be found in the binary's directory.
+            default_metallib_path = nil;
+        }
+        path_lib = default_metallib_path;
+    }
+
+    if (path_lib != nil) {
+        // pre-compiled library found
+        NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
+        metal_library = [device newLibraryWithURL:libURL error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            return NULL;
+        }
+    } else {
+        GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+
+        NSString * path_source;
+        NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+
+        GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+
+        if (path_resource) {
+            path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
+        } else {
+            path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
+        }
+
+        if (path_source == nil) {
+            GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
+            path_source = @"ggml-metal.metal";
+        }
+
+        GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+        src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            return NULL;
+        }
+    }
+#endif
+
+    if (!metal_library) {
+        @autoreleasepool {
+            // dictionary of preprocessor macros
+            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+
+            if (use_bfloat) {
+                [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
+            }
+
+#if GGML_METAL_EMBED_LIBRARY
+            [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
+#endif
+
+            MTLCompileOptions * options = [MTLCompileOptions new];
+            options.preprocessorMacros = prep;
+
+            //[options setFastMathEnabled:false];
+
+            metal_library = [device newLibraryWithSource:src options:options error:&error];
+            if (error) {
+                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
+
+#if !__has_feature(objc_arc)
+            [options release];
+#endif
+        }
+    }
+
+#if GGML_METAL_EMBED_LIBRARY
+    [src release];
+#endif // GGML_METAL_EMBED_LIBRARY
+
+    return metal_library;
+}
+
 static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
     GGML_LOG_INFO("%s: allocating\n", __func__);
 
@@ -522,136 +662,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
-    id<MTLLibrary> metal_library = nil;
-
     // load library
-    //
-    // - first check if the library is embedded
-    // - then check if the library is in the bundle
-    // - if not found, load the source and compile it
-    // - if that fails, return NULL
-    {
-        NSError * error = nil;
-        NSString * src = nil;
-
-#if GGML_METAL_EMBED_LIBRARY
-        GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
-
-        extern const char ggml_metallib_start[];
-        extern const char ggml_metallib_end[];
-
-        src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
-
-#else
-
-#ifdef SWIFT_PACKAGE
-        NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
-#else
-        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-#endif
-
-        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (path_lib == nil) {
-            // Try to find the resource in the directory where the current binary located.
-            NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
-            NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
-            NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
-            if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
-                GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
-                NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
-                if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
-                    // Optionally, if this is a symlink, try to resolve it.
-                    default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
-                    if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
-                        // It is a relative path, adding the binary directory as directory prefix.
-                        default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
-                    }
-                    if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
-                        // Link to the resource could not be resolved.
-                        default_metallib_path = nil;
-                    } else {
-                        GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
-                    }
-                }
-            } else {
-                // The resource couldn't be found in the binary's directory.
-                default_metallib_path = nil;
-            }
-            path_lib = default_metallib_path;
-        }
-
-        if (path_lib != nil) {
-            // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
-            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
-
-            metal_library = [device newLibraryWithURL:libURL error:&error];
-            if (error) {
-                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                return NULL;
-            }
-        } else {
-            GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
-
-            NSString * path_source;
-            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
-
-            GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
-
-            if (path_resource) {
-                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
-            } else {
-                path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
-            }
-
-            if (path_source == nil) {
-                GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
-                path_source = @"ggml-metal.metal";
-            }
-
-            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
-
-            src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
-            if (error) {
-                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                return NULL;
-            }
-        }
-#endif
-
-        if (!metal_library) {
-            @autoreleasepool {
-                // dictionary of preprocessor macros
-                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
-
-                if (ctx_dev->use_bfloat) {
-                    [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
-                }
-
-#if GGML_METAL_EMBED_LIBRARY
-                [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
-#endif
-
-                MTLCompileOptions * options = [MTLCompileOptions new];
-                options.preprocessorMacros = prep;
-
-                //[options setFastMathEnabled:false];
-
-                metal_library = [device newLibraryWithSource:src options:options error:&error];
-                if (error) {
-                    GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                    return NULL;
-                }
-
-#if !__has_feature(objc_arc)
-                [options release];
-#endif
-            }
-        }
-
-#if GGML_METAL_EMBED_LIBRARY
-        [src release];
-#endif // GGML_METAL_EMBED_LIBRARY
+    if (ctx_dev->mtl_library == nil) {
+        ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
+    }
+    id<MTLLibrary> metal_library = ctx_dev->mtl_library;
+    if (metal_library == nil) {
+        GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
+        return NULL;
     }
 
     // print MTL GPU family:
@@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
                 [metal_function release]; \
                 if (error) { \
                     GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
-                    [metal_library release]; \
                     return NULL; \
                 } \
             } else { \
@@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
     }
 
-    [metal_library release];
-
     return ctx;
 }
 
@@ -15,6 +15,7 @@ if (GGML_OPENCL_PROFILING)
 endif ()
 
 add_compile_definitions(GGML_OPENCL_SOA_Q)
+add_compile_definitions(GGML_OPENCL_TARGET_VERSION=${GGML_OPENCL_TARGET_VERSION})
 
 if (GGML_OPENCL_USE_ADRENO_KERNELS)
     message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
@@ -1,4 +1,4 @@
-#define CL_TARGET_OPENCL_VERSION 220
+#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 
 // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
 #include <vector>
 #include <string>
 #include <cmath>
+#include <memory>
+#include <charconv>
 
 #undef MIN
 #undef MAX
@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
     X1E,
 };
 
+struct ggml_cl_version {
+    cl_uint major = 0;
+    cl_uint minor = 0;
+};
+
+// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version parse_cl_version(std::string_view str) {
+    size_t major_str_begin = 0;
+    size_t major_str_end   = str.find(".", major_str_begin);
+    if (major_str_end == std::string::npos) {
+        return {};
+    }
+
+    size_t minor_str_begin = major_str_end + 1;
+    size_t minor_str_end   = str.find(" ", minor_str_begin);
+    if (minor_str_end == std::string::npos) {
+        return {};
+    }
+
+    cl_uint version_major;
+    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+        return {};
+    }
+
+    cl_uint version_minor;
+    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+        return {};
+    }
+    return { version_major, version_minor };
+}
+
+// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+    size_t param_size;
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+    auto param_value = std::string_view(param_storage.get(), param_size);
+    const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+    return parse_cl_version(param_value);
+}
+
+// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+    size_t param_size;
+
+#if CL_TARGET_OPENCL_VERSION >= 300
+    if (platform_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+        if (!param_size) {
+            return {};
+        }
+
+        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+        unsigned versions_count = param_size / sizeof(cl_name_version);
+
+        cl_version version_max = 0;
+        for (unsigned i = 0; i < versions_count; i++) {
+            version_max = std::max<cl_version>(versions[i].version, version_max);
+        }
+
+        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+    }
+#else
+    GGML_UNUSED(platform_version);
+#endif // CL_TARGET_OPENCL_VERSION >= 300
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+    if (!param_size) {
+        return {};
+    }
+
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+    auto param_value = std::string_view(param_storage.get(), param_size);
+
+    const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+
+    return parse_cl_version(param_value);
+}
+
 static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
     if (strstr(device_name, "730") ||
         strstr(device_name, "740") ||
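For reference, a minimal standalone sketch (not part of the patch) of what the new helpers do: parse the "<major>.<minor>" pair out of a typical CL_PLATFORM_VERSION / CL_DEVICE_OPENCL_C_VERSION string and turn it into a "-cl-std=" option the way the later hunks do. The platform string value here is hypothetical.

// Illustrative sketch only: mirrors parse_cl_version() on a host string,
// so the parsing rules can be checked without an OpenCL device.
#include <charconv>
#include <iostream>
#include <string_view>

struct cl_version_sketch { unsigned major = 0, minor = 0; };

static cl_version_sketch parse_sketch(std::string_view str) {
    size_t major_end = str.find('.');
    if (major_end == std::string_view::npos) return {};
    size_t minor_begin = major_end + 1;
    size_t minor_end   = str.find(' ', minor_begin);
    if (minor_end == std::string_view::npos) return {};
    cl_version_sketch v;
    if (std::from_chars(str.data(), str.data() + major_end, v.major).ec != std::errc{}) return {};
    if (std::from_chars(str.data() + minor_begin, str.data() + minor_end, v.minor).ec != std::errc{}) return {};
    return v;
}

int main() {
    // CL_PLATFORM_VERSION strings look like "OpenCL <major>.<minor> <vendor info>".
    std::string_view platform = "OpenCL 3.0 Example Platform";  // hypothetical value
    platform.remove_prefix(std::string_view("OpenCL ").size());
    cl_version_sketch v = parse_sketch(platform);
    // The backend builds the compile option the same way: "-cl-std=CL<major>.<minor>".
    std::cout << "-cl-std=CL" << v.major << "." << v.minor << "\n";  // prints -cl-std=CL3.0
}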
@ -470,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // A local ref of cl_device_id for convenience
     cl_device_id device = backend_ctx->device;
 
-    // Check device OpenCL version, OpenCL 2.0 or above is required
-    size_t device_ver_str_size;
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
-    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-    device_ver_buffer[device_ver_str_size] = '\0';
-    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
+    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
 
-    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+    // Check device OpenCL version, OpenCL 2.0 or above is required
+    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+    if (opencl_c_version.major < 2) {
         GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
         return backend_ctx;
     }
@ -516,8 +604,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
     // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-    if (strstr(device_ver_buffer, "OpenCL 3") &&
-        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
         strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
         GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
                        "(note that subgroups is an optional feature in OpenCL 3.0)\n");
@ -581,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     const std::string kernel_src = read_file("ggml-opencl.cl");
 #endif
 
-    std::string compile_opts =
-        "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
-        "-cl-finite-math-only -cl-fast-relaxed-math ";
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
     backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
 
     // Non matmul kernels.
@ -693,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
 
     // Gemv general
-    std::string CL_gemv_compile_opts =
-        " -cl-std=CL2.0 "
-        " -cl-mad-enable "
-        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable "
+                                       " -DSIMDGROUP_WIDTH=" +
+                                       std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@ -713,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-        " -cl-std=CL2.0 "
-        " -cl-mad-enable "
-        " -DLINE_STRIDE_A=2048 "
-        " -DBLOCK_STRIDE_A=16384 "
-        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@ -735,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-        " -cl-std=CL2.0 "
-        " -cl-mad-enable "
-        " -DLINE_STRIDE_A=2048 "
-        " -DBLOCK_STRIDE_A=16384 "
-        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@ -750,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 5504, 44032
-    CL_gemv_compile_opts =
-        " -cl-std=CL2.0 "
-        " -cl-mad-enable "
-        " -DLINE_STRIDE_A=5504 "
-        " -DBLOCK_STRIDE_A=44032 "
-        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=5504 "
+                           " -DBLOCK_STRIDE_A=44032 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@ -765,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 16000, 128000
-    CL_gemv_compile_opts =
-        " -cl-std=CL2.0 "
-        " -cl-mad-enable "
-        " -DLINE_STRIDE_A=16000 "
-        " -DBLOCK_STRIDE_A=128000 "
-        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=16000 "
+                           " -DBLOCK_STRIDE_A=128000 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
    if (has_vector_subgroup_broadcast) {
        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@ -3,44 +3,42 @@
 #include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
 
     if (row >= nrows) {
         return;
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
-    assert(blocks_per_warp>0);
+    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
 
-    // partial sum for each thread
+    assert(blocks_per_warp > 0);
+
+    // partial sum for each thread
     float tmp = 0.0f;
 
     const block_q_t * x = (const block_q_t *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
-         i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i; // x block index
 
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
 
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
+            const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
+                                          (qi / vdr)); // x block quant index when casting the quants to int
+
+            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        }
     }
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
 
     if (item_ct1.get_local_id(2) == 0) {
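A short illustrative sketch (hypothetical values, not from the patch) of why the ceiling division in the new blocks_per_warp matters once the launch width is WARP_SIZE: a plain integer division evaluates to 0 whenever qi exceeds vdr * WARP_SIZE, which would have tripped the assert, while the rounded-up form is always at least 1 so the block loop still advances.

// Illustrative sketch only: compares the old and new stride computation.
#include <iostream>

int main() {
    const int WARP_SIZE = 16;    // assumed sub-group width, smaller than the old QK_WARP_SIZE
    const int vdr = 1, qi = 32;  // hypothetical quantization parameters

    int old_blocks_per_warp = vdr * WARP_SIZE / qi;             // 0 -> assert would fire
    int new_blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;  // 1 -> loop still advances

    std::cout << old_blocks_per_warp << " vs " << new_blocks_per_warp << "\n";
}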
@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
 
     // partial sum for each thread
@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -860,13 +858,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@ -5,23 +5,24 @@
|
||||||
|
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
|
shared FLOAT_TYPE sccache1[2][BLOCK_SIZE/16][16];
|
||||||
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
|
shared FLOAT_TYPE sccache2[2][BLOCK_SIZE/16][16];
|
||||||
|
|
||||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||||
|
uint csel = 0;
|
||||||
|
|
||||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
|
 void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
     const uint y_idx = i * QUANT_K + y_offset;
 
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        csel ^= 1;
 
-        barrier();
         if (!all_threads) { // when we don't have enough blocks to use all threads
             if (i < num_blocks_per_row) {
                 const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
-                sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
-                sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
+                sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
+                sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
             }
             barrier();
 
@@ -29,8 +30,8 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
                 continue;
         } else {
             const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
-            sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
-            sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
+            sccache1[csel][ix][itid] = FLOAT_TYPE(scale & 0xF);
+            sccache2[csel][ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
             barrier();
         }
 
@@ -57,22 +58,22 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
             FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
             FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
             [[unroll]] for (int l = 0; l < 2; ++l) {
-                sum1 = fma(FLOAT_TYPE(  b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l  ],
-                       fma(FLOAT_TYPE( b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
-                       fma(FLOAT_TYPE( b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l  ],
-                       fma(FLOAT_TYPE( b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
-                       fma(FLOAT_TYPE( b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l  ],
-                       fma(FLOAT_TYPE( b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
-                       fma(FLOAT_TYPE( b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l  ],
-                       fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
-                sum2 = fma(FLOAT_TYPE(  b0[l]), sccache2[ix][ 8*v_im],
-                       fma(FLOAT_TYPE( b16[l]), sccache2[ix][1 + 8*v_im],
-                       fma(FLOAT_TYPE( b32[l]), sccache2[ix][2 + 8*v_im],
-                       fma(FLOAT_TYPE( b48[l]), sccache2[ix][3 + 8*v_im],
-                       fma(FLOAT_TYPE( b64[l]), sccache2[ix][4 + 8*v_im],
-                       fma(FLOAT_TYPE( b80[l]), sccache2[ix][5 + 8*v_im],
-                       fma(FLOAT_TYPE( b96[l]), sccache2[ix][6 + 8*v_im],
-                       fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
+                sum1 = fma(FLOAT_TYPE(  b0[l]), sccache1[csel][ix][ 8*v_im] * qs_u32_0[l  ],
+                       fma(FLOAT_TYPE( b16[l]), sccache1[csel][ix][1 + 8*v_im] * qs_u32_0[l+2],
+                       fma(FLOAT_TYPE( b32[l]), sccache1[csel][ix][2 + 8*v_im] * qs_u32_2[l  ],
+                       fma(FLOAT_TYPE( b48[l]), sccache1[csel][ix][3 + 8*v_im] * qs_u32_2[l+2],
+                       fma(FLOAT_TYPE( b64[l]), sccache1[csel][ix][4 + 8*v_im] * qs_u32_4[l  ],
+                       fma(FLOAT_TYPE( b80[l]), sccache1[csel][ix][5 + 8*v_im] * qs_u32_4[l+2],
+                       fma(FLOAT_TYPE( b96[l]), sccache1[csel][ix][6 + 8*v_im] * qs_u32_6[l  ],
+                       fma(FLOAT_TYPE(b112[l]), sccache1[csel][ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
+                sum2 = fma(FLOAT_TYPE(  b0[l]), sccache2[csel][ix][ 8*v_im],
+                       fma(FLOAT_TYPE( b16[l]), sccache2[csel][ix][1 + 8*v_im],
+                       fma(FLOAT_TYPE( b32[l]), sccache2[csel][ix][2 + 8*v_im],
+                       fma(FLOAT_TYPE( b48[l]), sccache2[csel][ix][3 + 8*v_im],
+                       fma(FLOAT_TYPE( b64[l]), sccache2[csel][ix][4 + 8*v_im],
+                       fma(FLOAT_TYPE( b80[l]), sccache2[csel][ix][5 + 8*v_im],
+                       fma(FLOAT_TYPE( b96[l]), sccache2[csel][ix][6 + 8*v_im],
+                       fma(FLOAT_TYPE(b112[l]), sccache2[csel][ix][7 + 8*v_im], sum2))))))));
             }
             temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
         }
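Note: the csel changes above implement a simple ping-pong (double) buffer for the shared scale cache. The sketch below is illustrative host-side C++ only; the array shape and names are assumptions, not taken from the shader. It shows why flipping between two copies lets the leading barrier be dropped: the copy read from the previous iteration is never the copy being written.

    // Minimal ping-pong buffer sketch (illustrative, not part of the commit).
    #include <array>
    #include <cstdio>

    int main() {
        std::array<std::array<float, 8>, 2> cache{}; // two copies, like sccache[2][...]
        unsigned csel = 0;

        for (int iter = 0; iter < 4; ++iter) {
            csel ^= 1; // flip the active copy first, as the shaders now do
            for (int i = 0; i < 8; ++i) {
                cache[csel][i] = float(iter * 8 + i); // writes go to the active copy
            }
            // cache[csel ^ 1] still holds the previous iteration's values untouched,
            // so readers of iteration iter-1 are never raced by these writes.
            std::printf("iter %d wrote copy %u, copy %u kept old data\n", iter, csel, csel ^ 1u);
        }
        return 0;
    }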
@@ -5,20 +5,21 @@
 
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
-shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
+shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][2][8];
 
 FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+uint csel = 0;
 
 void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
     const uint y_idx = i * QUANT_K + y_offset;
 
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        csel ^= 1;
 
         if (!all_threads) { // when we don't have enough blocks to use all threads
-            barrier();
             if (i < num_blocks_per_row)
-                sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
+                sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
             barrier();
 
             if (i >= num_blocks_per_row)
@@ -40,8 +41,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
         const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
 
         if (all_threads) {
-            barrier();
-            sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
+            sccache[csel][ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
             barrier();
         }
 
@@ -59,14 +59,14 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
 
             FLOAT_TYPE sum = FLOAT_TYPE(0.0);
             [[unroll]] for (int l = 0; l < 2; ++l) {
-                sum = fma(FLOAT_TYPE(  b0[l]) * sccache[ix][v_im][0], qs_u32_0[l  ] - hmk_0[l  ],
-                      fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
-                      fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l  ] - hmk_1[l  ],
-                      fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
-                      fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l  ] - hmk_2[l  ],
-                      fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
-                      fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l  ] - hmk_3[l  ],
-                      fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
+                sum = fma(FLOAT_TYPE(  b0[l]) * sccache[csel][ix][v_im][0], qs_u32_0[l  ] - hmk_0[l  ],
+                      fma(FLOAT_TYPE( b16[l]) * sccache[csel][ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
+                      fma(FLOAT_TYPE( b32[l]) * sccache[csel][ix][v_im][2], qs_u32_2[l  ] - hmk_1[l  ],
+                      fma(FLOAT_TYPE( b48[l]) * sccache[csel][ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
+                      fma(FLOAT_TYPE( b64[l]) * sccache[csel][ix][v_im][4], qs_u32_4[l  ] - hmk_2[l  ],
+                      fma(FLOAT_TYPE( b80[l]) * sccache[csel][ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
+                      fma(FLOAT_TYPE( b96[l]) * sccache[csel][ix][v_im][6], qs_u32_6[l  ] - hmk_3[l  ],
+                      fma(FLOAT_TYPE(b112[l]) * sccache[csel][ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
             }
             temp[j][n] = fma(d, sum, temp[j][n]);
         }
@@ -6,20 +6,21 @@
 
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
-shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
+shared FLOAT_TYPE sccache[2][BLOCK_SIZE/16][16];
 
 FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+uint csel = 0;
 
 void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
     const uint y_idx = i * QUANT_K + y_offset;
 
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        csel ^= 1;
 
         if (!all_threads) { // when we don't have enough blocks to use all threads
-            barrier();
             if (i < num_blocks_per_row)
-                sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
+                sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
             barrier();
 
             if (i >= num_blocks_per_row)
@@ -51,8 +52,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
         const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
 
         if (all_threads) {
-            barrier();
-            sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
+            sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
             barrier();
         }
 
@@ -71,7 +71,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
                 sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
                 sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
             }
-            temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
+            temp[j][n] = fma(fma(sum[0], sccache[csel][ix][s_offset], fma(sum[1], sccache[csel][ix][s_offset + 2], fma(sum[2], sccache[csel][ix][s_offset + 4], sum[3] * sccache[csel][ix][s_offset + 6]))), d, temp[j][n]);
         }
     }
 }
@@ -743,7 +743,7 @@ void main() {
         [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
             coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
 
-            [[unroll]] for (uint col = 0; col < BN; col += storestride) {
+            [[unroll]] for (uint col = 0; col < TN; col += storestride) {
                 const uint row_i = dc + cm_col * TN + col + store_c;
                 if (row_i >= _ne1) break;
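Note: the BN -> TN change above appears to keep the copy-out loop inside the single TM x TN tile that coopMatStore just staged; with the old BN bound (the full block width) the loop could step past that tile. A rough host-side illustration, with all sizes assumed for the example:

    #include <cstdio>

    int main() {
        const unsigned TM = 16, TN = 16, BN = 64, storestride = 8; // assumed sizes

        unsigned with_tn = 0, with_bn = 0;
        for (unsigned col = 0; col < TN; col += storestride) ++with_tn; // stays inside the TM x TN tile
        for (unsigned col = 0; col < BN; col += storestride) ++with_bn; // would walk past it

        std::printf("TN bound: %u column steps, BN bound: %u column steps for a %ux%u tile\n",
                    with_tn, with_bn, TM, TN);
        return 0;
    }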
@@ -253,6 +253,7 @@ class MODEL_ARCH(IntEnum):
     MINICPM3     = auto()
     GEMMA        = auto()
     GEMMA2       = auto()
+    GEMMA3       = auto()
     STARCODER2   = auto()
     RWKV6        = auto()
     RWKV6QWEN2   = auto()
@@ -440,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MINICPM3:     "minicpm3",
     MODEL_ARCH.GEMMA:        "gemma",
     MODEL_ARCH.GEMMA2:       "gemma2",
+    MODEL_ARCH.GEMMA3:       "gemma3",
     MODEL_ARCH.STARCODER2:   "starcoder2",
     MODEL_ARCH.RWKV6:        "rwkv6",
     MODEL_ARCH.RWKV6QWEN2:   "rwkv6qwen2",
@@ -1077,6 +1079,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_PRE_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
+    MODEL_ARCH.GEMMA3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -2876,11 +2876,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
 
     bool llava_images_changed = false;
 
-    bool add_bos_token = true;
-    if(file_format == FileFormat::GGUF_GENERIC)
+    bool add_bos_token = true; //if set to false, mmproj handling breaks
+    if(file_format == FileFormat::GGUF_GENERIC && mmproj_filename == "")
     {
         const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
         add_bos_token = llama_vocab_get_add_bos(tmpvocab);
+        if(!add_bos_token)
+        {
+            printf("\nBOS token prefix was disabled for this model.");
+        }
     }
 
     for(int x=0;x<inputs.stop_sequence_len;++x)
@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3,        "minicpm3"   },
     { LLM_ARCH_GEMMA,           "gemma"      },
     { LLM_ARCH_GEMMA2,          "gemma2"     },
+    { LLM_ARCH_GEMMA3,          "gemma3"     },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
     { LLM_ARCH_XVERSE,          "xverse"     },
@@ -766,6 +767,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,    "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,    "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
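Note: the "%d" placeholders in the new GEMMA3 tensor-name table are expanded with the layer index when tensors are looked up. Purely illustrative sketch (snprintf stands in for the library's own name-formatting path):

    #include <cstdio>

    int main() {
        const char * fmt = "blk.%d.attn_q_norm"; // one of the GEMMA3 entries above
        char name[64];
        std::snprintf(name, sizeof(name), fmt, 7);
        std::printf("layer 7 tensor name: %s\n", name); // -> blk.7.attn_q_norm
        return 0;
    }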
@@ -40,6 +40,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
+    LLM_ARCH_GEMMA3,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>
+#include <cmath>
 #include <functional>
 #include <map>
 #include <sstream>
@@ -869,6 +870,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_1B; break;
+                    case 34: type = LLM_TYPE_4B; break;
+                    case 48: type = LLM_TYPE_12B; break;
+                    case 62: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2549,6 +2567,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                     layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                }
+            } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
+                    layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
+
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
@@ -3746,6 +3793,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n", __func__, hparams.f_clamp_kqv);
         LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
         LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale     = %.1e\n", __func__, hparams.f_attention_scale);
         LLAMA_LOG_INFO("%s: n_ff             = %s\n",   __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_expert         = %u\n",   __func__, hparams.n_expert);
         LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",   __func__, hparams.n_expert_used);
@@ -4022,6 +4070,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
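Note: the load_hparams hunk above gives Gemma 3 a size-dependent attention scale: the 27B variant scales queries by 1/sqrt(n_embd / n_head), everything else by the usual 1/sqrt(n_embd_head_k). A small check with assumed 27B-like dimensions (n_embd = 5376, n_head = 32, n_embd_head_k = 128; the real values come from the GGUF metadata at load time):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_embd = 5376.0f, n_head = 32.0f, n_embd_head_k = 128.0f; // assumed

        const float scale_27b   = 1.0f / std::sqrt(n_embd / n_head); // ~0.0772
        const float scale_other = 1.0f / std::sqrt(n_embd_head_k);   // ~0.0884

        std::printf("27B-style scale: %.4f, default head-dim scale: %.4f\n", scale_27b, scale_other);
        return 0;
    }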
src/llama.cpp (147 changes)
@@ -5014,6 +5014,149 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma3 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask     = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        // "5-to-1 interleaved attention"
+        // 5 layers of local attention followed by 1 layer of global attention
+        static const int sliding_window_pattern = 6;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const bool is_sliding = (il + 1) % sliding_window_pattern;
+            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
+            const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens);
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
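Note: the `(il + 1) % sliding_window_pattern` test in build_gemma3 above implements the 5-to-1 interleave: every sixth layer (il = 5, 11, 17, ...) uses the global mask, the rest use the sliding-window mask. A quick enumeration, assuming a 26-layer model purely for illustration:

    #include <cstdio>

    int main() {
        const int sliding_window_pattern = 6;
        const int n_layer = 26; // assumed layer count (1B-sized config)

        for (int il = 0; il < n_layer; ++il) {
            const bool is_sliding = (il + 1) % sliding_window_pattern; // non-zero -> local/SWA layer
            std::printf("layer %2d: %s\n", il, is_sliding ? "sliding-window (local)" : "global");
        }
        return 0;
    }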
@@ -8334,6 +8477,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma2();
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+                result = llm.build_gemma3();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();