diff --git a/common/arg.cpp b/common/arg.cpp index ca698f916..67fd52279 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -366,6 +366,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) { print_options(specific_options); } +static void common_params_print_completion(common_params_context & ctx_arg) { + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + + for (auto & opt : ctx_arg.options) { + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + + printf("_llama_completions() {\n"); + printf(" local cur prev opts\n"); + printf(" COMPREPLY=()\n"); + printf(" cur=\"${COMP_WORDS[COMP_CWORD]}\"\n"); + printf(" prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n"); + + printf(" opts=\""); + auto print_options = [](const std::vector & options) { + for (const common_arg * opt : options) { + for (const char * arg : opt->args) { + printf("%s ", arg); + } + } + }; + + print_options(common_options); + print_options(sparam_options); + print_options(specific_options); + printf("\"\n\n"); + + printf(" case \"$prev\" in\n"); + printf(" --model)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" --grammar-file)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" --chat-template-file)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" *)\n"); + printf(" COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" esac\n"); + printf("}\n\n"); + + std::set executables = { + "llama-batched", + "llama-batched-bench", + "llama-bench", + "llama-cli", + "llama-convert-llama2c-to-ggml", + "llama-cvector-generator", + "llama-embedding", + "llama-eval-callback", + "llama-export-lora", + "llama-gbnf-validator", + "llama-gen-docs", + "llama-gguf", + "llama-gguf-hash", + "llama-gguf-split", + "llama-gritlm", + "llama-imatrix", + "llama-infill", + "llama-llava-cli", + "llama-llava-clip-quantize-cli", + "llama-lookahead", + "llama-lookup", + "llama-lookup-create", + "llama-lookup-merge", + "llama-lookup-stats", + "llama-minicpmv-cli", + "llama-parallel", + "llama-passkey", + "llama-perplexity", + "llama-q8dot", + "llama-quantize", + "llama-quantize-stats", + "llama-qwen2vl-cli", + "llama-retrieval", + "llama-run", + "llama-save-load-state", + "llama-server", + "llama-simple", + "llama-simple-chat", + "llama-speculative", + "llama-speculative-simple", + "llama-tokenize", + "llama-tts", + "llama-vdot" + }; + + for (const auto& exe : executables) { + printf("complete -F _llama_completions %s\n", exe.c_str()); + } +} + static std::vector parse_device_list(const std::string & value) { std::vector devices; auto dev_names = string_split(value, ','); @@ -427,6 +533,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e } exit(0); } + if (ctx_arg.params.completion) { + common_params_print_completion(ctx_arg); + exit(0); + } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); ctx_arg.params = params_org; @@ -495,6 +605,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex exit(0); } )); + add_opt(common_arg( + {"--completion-bash"}, + "print source-able bash completion script for llama.cpp", + [](common_params & params) { + params.completion = true; + } + )); add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), @@ -947,6 +1064,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.min_p = std::stof(value); } ).set_sparam()); + add_opt(common_arg( + {"--top-nsigma"}, "N", + string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma), + [](common_params & params, const std::string & value) { + params.sampling.top_n_sigma = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), @@ -1976,6 +2100,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_jinja = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); + add_opt(common_arg( + {"--reasoning-format"}, "FORMAT", + "reasoning format (default: deepseek; allowed values: deepseek, none)\n" + "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n" + "only supported for non-streamed responses", + [](common_params & params, const std::string & value) { + /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } + else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( diff --git a/common/chat.cpp b/common/chat.cpp index ef1c6fb3d..5b8e280aa 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -12,11 +12,13 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x"; case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools"; case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1"; + case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: return "DeepSeek R1 (extract reasoning)"; case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; + case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; default: throw std::runtime_error("Unknown chat format"); } @@ -105,7 +107,6 @@ static common_chat_msg parse_json_tool_calls( std::sregex_iterator rend; std::sregex_iterator rit(it, end, function_regex); if (rit == rend) { - fprintf(stderr, "No more tool calls found\n"); result.content += std::string(it, end); break; } @@ -115,14 +116,21 @@ static common_chat_msg parse_json_tool_calls( json arguments; if (!parse_json(it, end, arguments)) { - throw std::runtime_error("Failed to parse json tool call arguments"); + throw std::runtime_error("Failed to parse json tool call arguments: " + input); } if (!std::regex_search(it, end, match, close_regex)) { - throw std::runtime_error("Malformed input, missing closing pattern"); + throw std::runtime_error("Malformed input, missing closing pattern: " + input); } it = match.suffix().first; result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); } + + if (!result.tool_calls.empty()) { + if (!string_strip(result.content).empty()) { + LOG_WRN("Content found with tool calls: %s\n", result.content.c_str()); + } + result.content = ""; + } return result; } @@ -134,11 +142,11 @@ static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& in result.role = "assistant"; const auto process_tool_calls = [&](const json & tool_calls) { for (const auto & tool_call : tool_calls) { - const auto & arguments = tool_call["arguments"]; + const auto & arguments = tool_call.at("arguments"); result.tool_calls.push_back({ - tool_call["name"], + tool_call.at("name"), arguments.is_string() ? arguments.get() : arguments.dump(), - tool_call.contains("id") ? tool_call["id"] : "", + tool_call.contains("id") ? tool_call.at("id") : "", }); } }; @@ -155,7 +163,7 @@ static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& in static void foreach_function(const json & tools, const std::function & fn) { for (const auto & tool : tools) { - if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) { + if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) { LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str()); continue; } @@ -190,27 +198,27 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp auto tool_call_schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; + const auto & function = tool.at("function"); auto tool_schema = json { {"type", "object"}, {"properties", { {"name", { {"type", "string"}, - {"const", function["name"]}, + {"const", function.at("name")}, }}, - {"arguments", function["parameters"]}, + {"arguments", function.at("parameters")}, }}, {"required", json::array({"name", "arguments"})}, }; if (function.contains("description")) { - tool_schema["description"] = function["description"]; + tool_schema["description"] = function.at("description"); } if (inputs.parallel_tool_calls) { - tool_schema["properties"]["id"] = { + tool_schema.at("properties")["id"] = { {"type", "string"}, {"minLength", 4}, }; - tool_schema["required"].push_back("id"); + tool_schema.at("required").push_back("id"); } tool_call_schemas.emplace_back(tool_schema); }); @@ -275,21 +283,21 @@ static common_chat_msg common_chat_parse_generic(const std::string & input) { common_chat_msg result; result.role = "assistant"; if (data.contains("tool_calls")) { - for (const auto & tool_call : data["tool_calls"]) { + for (const auto & tool_call : data.at("tool_calls")) { result.tool_calls.push_back({ - tool_call["name"], - tool_call["arguments"].dump(), - tool_call.contains("id") ? tool_call["id"] : "", + tool_call.at("name"), + tool_call.at("arguments").dump(), + tool_call.contains("id") ? tool_call.at("id") : "", }); } } else if (data.contains("tool_call")) { result.tool_calls.push_back({ - data["tool_call"]["name"], - data["tool_call"]["arguments"].dump(), + data.at("tool_call").at("name"), + data.at("tool_call").at("arguments").dump(), /* id= */ "", }); } else if (data.contains("response")) { - const auto & response = data["response"]; + const auto & response = data.at("response"); result.content = response.is_string() ? response.get() : response.dump(2); } return result; @@ -301,7 +309,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; + const auto & function = tool.at("function"); schemas.push_back({ {"type", "object"}, {"properties", { @@ -309,9 +317,9 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object. {"name", { {"type", "string"}, - {"const", function["name"]}, + {"const", function.at("name")}, }}, - {"arguments", function["parameters"]}, + {"arguments", function.at("parameters")}, {"id", { {"type", "string"}, // Nemo's template expects a 9-character alphanumeric ID. @@ -346,7 +354,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; + const auto & function = tool.at("function"); schemas.push_back({ {"type", "object"}, {"properties", { @@ -357,9 +365,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ }}, {"tool_name", { {"type", "string"}, - {"const", function["name"]}, + {"const", function.at("name")}, }}, - {"parameters", function["parameters"]}, + {"parameters", function.at("parameters")}, }}, {"required", json::array({"tool_call_id", "tool_name", "parameters"})}, }); @@ -382,39 +390,65 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ "<|END_THINKING|>", "<|END_ACTION|>", }; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; + auto adjusted_messages = json::array(); + for (const auto & msg : inputs.messages) { + auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string(); + auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array(); + if (has_reasoning_content && has_tool_calls) { + auto adjusted_message = msg; + adjusted_message["tool_plan"] = msg.at("reasoning_content"); + adjusted_message.erase("reasoning_content"); + adjusted_messages.push_back(adjusted_message); + } else { + adjusted_messages.push_back(msg); + } + } + data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); + data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING : COMMON_CHAT_FORMAT_COMMAND_R7B; return data; } -static common_chat_msg common_chat_parse_command_r7b(const std::string & input) { - static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); - static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); +static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) { + static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)"); + static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); + static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); + std::smatch match; common_chat_msg result; result.role = "assistant"; - if (std::regex_match(input, match, response_regex)) { - result.content = match[1].str(); - } else if (std::regex_match(input, match, thought_action_regex)) { - result.tool_plan = match[1].str(); - auto actions_str = match[2].str(); + + std::string rest = input; + + if (std::regex_match(rest, match, thought_regex)) { + if (extract_reasoning) { + result.reasoning_content = match[2].str(); + } else if (!match[2].str().empty()) { + // Let the unparsed thinking tags through in content only if their insides aren't empty. + result.content = match[1].str(); + } + rest = match[3].str(); + } + if (std::regex_match(rest, match, action_regex)) { + auto actions_str = match[1].str(); auto actions = json::parse(actions_str); for (const auto & action : actions) { result.tool_calls.push_back({ - /* .name = */ action["tool_name"], - /* .arguments = */ action["parameters"].dump(), - /* .id = */ action["tool_call_id"], + /* .name = */ action.at("tool_name"), + /* .arguments = */ action.at("parameters").dump(), + /* .id = */ action.at("tool_call_id"), }); } + } else if (std::regex_match(rest, match, response_regex)) { + auto response = match[1].str(); + result.content += response; } else { - LOG_ERR("Failed to parse command_r output"); - result.content = input; + result.content += rest; } return result; } static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector & expected_properties) { - if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) { + if (!parameters.is_object() || !parameters.contains("type") || parameters.at("type") != "object" || !parameters.contains("properties") || !parameters.contains("required")) { throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties"); } const auto & parameters_properties = parameters.at("properties"); @@ -468,9 +502,9 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com }; foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); builder.resolve_refs(parameters); // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime @@ -546,34 +580,90 @@ static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bo static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; - auto args_rule = builder.add_schema(name + "-args", parameters); - tool_rules.push_back(builder.add_rule(name + "-call", - "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\"")); - }); - data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false}); - data.preserved_tokens = { - "<|tool▁sep|>", - "<|tool▁call▁end|>", - }; - builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space"); - }, grammar_options); + if (inputs.tools.is_array() && !inputs.tools.empty()) { + data.grammar_lazy = inputs.tool_choice != "required" && inputs.json_schema.is_null(); + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + auto args_rule = builder.add_schema(name + "-args", parameters); + tool_rules.push_back(builder.add_rule(name + "-call", + "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n" + "```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\"")); + }); + // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, + // so we accept common variants (then it's all constrained) + builder.add_rule("root", + "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" ) " + "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " + "\"<|tool▁calls▁end|>\"" + " space"); + data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false}); + data.preserved_tokens = { + "", + "", + "<|tool▁sep|>", + "<|tool▁calls▁end|", + "<|tool▁call▁end|>", + }; + }, grammar_options); + } auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + + // Hacks to fix the official (broken) prompt. + // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead, + // until the official template is fixed. + if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) { + // Don't leave the chat dangling after tool results + if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) { + prompt += "<|end▁of▁sentence|>"; + if (inputs.add_generation_prompt) { + prompt += "<|Assistant|>"; + } + } + // Fix up tool call delta example added by Minja + prompt = std::regex_replace( + prompt, + std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"), + "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2"); + } data.prompt = prompt; - data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; + data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1; return data; } -static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) { - static std::regex trigger_regex("<|tool▁calls▁begin|>"); +static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); - static std::regex close_regex("```<|tool▁call▁end|>"); - return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex); + static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); + static std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); + static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); + common_chat_msg msg; + msg.role = "assistant"; + std::smatch match; + if (std::regex_match(input, match, reasoning_content_regex)) { + std::string rest; + if (extract_reasoning) { + msg.reasoning_content = string_strip(match[2].str()); + } else { + msg.content = match[1].str(); + } + rest = match[3].str(); + + if (std::regex_search(rest, match, tool_calls_regex)) { + auto tool_calls = match[1].str(); + auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex); + msg.tool_calls = std::move(msg2.tool_calls); + } else { + msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end()); + } + } else { + msg.content = input; + } + return msg; } static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { @@ -583,20 +673,20 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c {"datetime", "Jan 29 2025 13:00:00 GMT"}, {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))}, }); - if (!inputs.tools.is_null() && !inputs.tools.empty()) { + if (inputs.tools.is_array() && !inputs.tools.empty()) { data.grammar_lazy = inputs.tool_choice != "required"; data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; + const auto & function = tool.at("function"); schemas.push_back({ {"type", "object"}, {"properties", { {"name", { {"type", "string"}, - {"const", function["name"]}, + {"const", function.at("name")}, }}, - {"arguments", function["parameters"]}, + {"arguments", function.at("parameters")}, }}, {"required", json::array({"name", "arguments", "id"})}, }); @@ -628,15 +718,15 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ common_chat_params data; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; - if (!inputs.tools.is_null() && !inputs.tools.empty()) { + if (inputs.tools.is_array() && !inputs.tools.empty()) { data.grammar_lazy = inputs.tool_choice != "required"; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector first_tool_rules; std::vector subsequent_tool_rules; foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); auto args_rule = builder.add_schema(name + "-args", parameters); first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); @@ -716,9 +806,9 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - const auto & parameters = function["parameters"]; - std::string name = function["name"]; + const auto & function = tool.at("function"); + const auto & parameters = function.at("parameters"); + std::string name = function.at("name"); if (name == "python" || name == "ipython") { if (!parameters.contains("type")) { throw std::runtime_error("Missing type in python tool"); @@ -789,9 +879,9 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool["function"]; - std::string name = function["name"]; - auto parameters = function["parameters"]; + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); builder.resolve_refs(parameters); tool_rules.push_back(builder.add_schema(name + "-call", { {"type", "object"}, @@ -839,9 +929,9 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) if (!parse_json(it, end, call)) { throw std::runtime_error("Failed to parse json tool call"); } - const auto & arguments = call["arguments"]; + const auto & arguments = call.at("arguments"); result.tool_calls.push_back({ - call["name"], + call.at("name"), arguments.dump(), // arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ "", @@ -884,47 +974,72 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha } common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none"; - LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false"); + const auto & src = tmpl.source(); + const auto & caps = tmpl.original_caps(); - if (has_tools && !inputs.grammar.empty()) { - throw std::runtime_error("Cannot specify grammar with tools"); + if (inputs.tools.is_array()) { + if (inputs.tool_choice != "none" && !inputs.grammar.empty()) { + throw std::runtime_error("Cannot specify grammar with tools"); + } + if (caps.supports_tool_calls && !caps.supports_tools) { + LOG_WRN("Template supports tool calls but does not natively describe tools. The fallback behaviour used may produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n"); + } } - const auto & src = tmpl.source(); + // DeepSeek R1: use handler in all cases except json schema (thinking / tools). + if (src.find("<|tool▁calls▁begin|>") != std::string::npos && inputs.json_schema.is_null()) { + return common_chat_params_init_deepseek_r1(tmpl, inputs); + } + + // Command R7B: : use handler in all cases except json schema (thinking / tools). + if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && inputs.json_schema.is_null()) { + return common_chat_params_init_command_r7b(tmpl, inputs); + } + + // Use generic handler when mixing tools + JSON schema. + // TODO: support that mix in handlers below. + if ((!inputs.tools.is_array() && inputs.json_schema.is_object())) { + return common_chat_params_init_generic(tmpl, inputs); + } + + // Functionary prepends "all\n" to plain content outputs, so we use its handler in all cases. if (src.find(">>>all") != std::string::npos) { - // Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when return common_chat_params_init_functionary_v3_2(tmpl, inputs); } + + // Firefunction v2 requires datetime and functions in the context even w/o tools, so we also use its handler in all cases. if (src.find(" functools[") != std::string::npos) { - // Firefunction v2 requires datetime and functions in the context, even w/o tools. return common_chat_params_init_firefunction_v2(tmpl, inputs); } - if (!has_tools) { + // Plain handler (no tools) + if (inputs.tools.is_null() || inputs.tool_choice == "none") { return common_chat_params_init_without_tools(tmpl, inputs); } + // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) if (src.find("") != std::string::npos) { return common_chat_params_init_hermes_2_pro(tmpl, inputs); } + + // Functionary v3.1 (w/ tools) if (src.find("<|start_header_id|>") != std::string::npos && src.find("ipython<|end_header_id|>") != std::string::npos) { auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos; return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools); } - if (src.find("<|tool▁calls▁begin|>") != std::string::npos) { - return common_chat_params_init_deepseek_r1(tmpl, inputs); - } + + // Mistral Nemo (w/ tools) if (src.find("[TOOL_CALLS]") != std::string::npos) { return common_chat_params_init_mistral_nemo(tmpl, inputs); } - if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) { - return common_chat_params_init_command_r7b(tmpl, inputs); - } + + // Generic fallback return common_chat_params_init_generic(tmpl, inputs); } @@ -949,7 +1064,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true); case COMMON_CHAT_FORMAT_DEEPSEEK_R1: - return common_chat_parse_deepseek_r1(input); + return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ false); + case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: + return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ true); case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return common_chat_parse_functionary_v3_2(input); case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: @@ -959,7 +1076,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return common_chat_parse_firefunction_v2(input); case COMMON_CHAT_FORMAT_COMMAND_R7B: - return common_chat_parse_command_r7b(input); + return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false); + case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: + return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true); default: throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); } diff --git a/common/chat.hpp b/common/chat.hpp index 33e64a430..ba1632f66 100644 --- a/common/chat.hpp +++ b/common/chat.hpp @@ -19,6 +19,7 @@ struct common_chat_inputs { bool stream; std::string grammar; bool add_generation_prompt = true; + bool extract_reasoning = true; }; enum common_chat_format { @@ -28,11 +29,13 @@ enum common_chat_format { COMMON_CHAT_FORMAT_LLAMA_3_X, COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, COMMON_CHAT_FORMAT_DEEPSEEK_R1, + COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, COMMON_CHAT_FORMAT_FIREFUNCTION_V2, COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, COMMON_CHAT_FORMAT_HERMES_2_PRO, COMMON_CHAT_FORMAT_COMMAND_R7B, + COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; diff --git a/common/common.h b/common/common.h index 09d45279a..b1432a249 100644 --- a/common/common.h +++ b/common/common.h @@ -136,6 +136,7 @@ struct common_params_sampling { int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f;// -1.0 = disabled float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate bool ignore_eos = false; @@ -198,6 +199,11 @@ struct common_params_vocoder { bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT }; +enum common_reasoning_format { + COMMON_REASONING_FORMAT_NONE, + COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content` +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -288,6 +294,7 @@ struct common_params { bool kl_divergence = false; // compute KL divergence bool usage = false; // print usage + bool completion = false; // print source-able completion script bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output bool interactive = false; // interactive mode @@ -342,6 +349,7 @@ struct common_params { std::string chat_template = ""; // NOLINT bool use_jinja = false; // NOLINT bool enable_chat_template = true; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; std::vector api_keys; @@ -420,13 +428,13 @@ bool set_process_priority(enum ggml_sched_priority prio); // #ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +# if defined(__MINGW32__) && !defined(__clang__) +# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +# else +# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +# endif #else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) #endif LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) @@ -619,7 +627,7 @@ struct common_chat_msg { std::string role; std::string content; std::vector tool_calls; - std::string tool_plan = ""; + std::string reasoning_content = ""; }; // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid diff --git a/common/log.cpp b/common/log.cpp index 4bfbecf15..52b31470c 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -1,5 +1,6 @@ #include "log.h" +#include #include #include #include diff --git a/common/log.h b/common/log.h index 4ebc6314b..c56bb50d9 100644 --- a/common/log.h +++ b/common/log.h @@ -15,7 +15,7 @@ #ifndef __GNUC__ # define LOG_ATTRIBUTE_FORMAT(...) -#elif defined(__MINGW32__) +#elif defined(__MINGW32__) && !defined(__clang__) # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) diff --git a/common/sampling.cpp b/common/sampling.cpp index e4b21ca10..37a0d9c85 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -134,11 +134,11 @@ std::string common_params_sampling::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" - "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, - top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, + top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co lparams.no_perf = params.no_perf; - std::vector trigger_words; - trigger_words.reserve(params.grammar_trigger_words.size()); - for (const auto & str : params.grammar_trigger_words) { - trigger_words.push_back(str.word.c_str()); - } - struct llama_sampler * grmr; if (params.grammar.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE @@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE } else { + std::vector trigger_words; + trigger_words.reserve(params.grammar_trigger_words.size()); + for (const auto & str : params.grammar_trigger_words) { + trigger_words.push_back(str.word.c_str()); + } + grmr = params.grammar_lazy ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root", trigger_words.data(), trigger_words.size(), @@ -188,45 +188,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.data())); if (params.mirostat == 0) { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: - { - std::vector c_breakers; - c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto & str : params.dry_sequence_breakers) { - c_breakers.push_back(str.c_str()); - } + if (params.top_n_sigma >= 0) { + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); + } else { + for (const auto & cnstr : params.samplers) { + switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: + { + std::vector c_breakers; + c_breakers.reserve(params.dry_sequence_breakers.size()); + for (const auto & str : params.dry_sequence_breakers) { + c_breakers.push_back(str.c_str()); + } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); - } - break; - case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); - break; - case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); - break; - case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); - break; - default: - GGML_ASSERT(false && "unknown sampler type"); + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + } + break; + case COMMON_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case COMMON_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); + break; + case COMMON_SAMPLER_TYPE_PENALTIES: + llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); + } } } llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index 9311410e1..1925b334b 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d320e9d6b..71151183b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -173,6 +173,7 @@ struct slot_params { {"grammar_trigger_words", grammar_trigger_words}, {"grammar_trigger_tokens", sampling.grammar_trigger_tokens}, {"preserved_tokens", sampling.preserved_tokens}, + {"chat_format", common_chat_format_name(oaicompat_chat_format)}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, @@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result { msg.content = content; } - json tool_calls; + json message { + {"role", "assistant"}, + }; + if (!msg.reasoning_content.empty()) { + message["reasoning_content"] = msg.reasoning_content; + } + if (msg.content.empty() && !msg.tool_calls.empty()) { + message["content"] = json(); + } else { + message["content"] = msg.content; + } if (!msg.tool_calls.empty()) { - tool_calls = json::array(); + auto tool_calls = json::array(); for (const auto & tc : msg.tool_calls) { tool_calls.push_back({ {"type", "function"}, @@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result { {"id", tc.id}, }); } - } - - json message { - {"content", msg.content}, - {"tool_calls", tool_calls}, - {"role", "assistant"}, - }; - if (!msg.tool_plan.empty()) { - message["tool_plan"] = msg.tool_plan; + message["tool_calls"] = tool_calls; } json choice { @@ -2073,8 +2076,8 @@ struct server_context { if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict); slot.params.n_predict = slot.n_predict; - SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } if (slot.params.ignore_eos && has_eos_token) { @@ -4060,7 +4063,7 @@ int main(int argc, char ** argv) { } auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates); + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -4073,7 +4076,7 @@ int main(int argc, char ** argv) { // same with handle_chat_completions, but without inference part const auto handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) { auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates); + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates); res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); }; diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 4a551404f..ba3367b4f 100644 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -92,6 +92,7 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] + assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -155,11 +156,11 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + # (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + # (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), @@ -175,7 +176,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + # (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), # TODO: fix these # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), @@ -214,6 +215,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] + assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -273,7 +275,6 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t @pytest.mark.slow @pytest.mark.parametrize("hf_repo,template_override", [ - ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")), ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -298,13 +299,16 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")), + + ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server n_predict = 512 server.n_slots = 1 @@ -323,6 +327,7 @@ def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": n_predict, "messages": [ + {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."}, {"role": "user", "content": "What is the weather in Istanbul?"}, ], "tools": [WEATHER_TOOL], @@ -332,6 +337,7 @@ def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None) tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] + assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"] actual_arguments = json.loads(tool_call["function"]["arguments"]) assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" @@ -340,22 +346,166 @@ def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None) assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' +@pytest.mark.slow +@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ + (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), + (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + ("^> 0.56$", 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + + # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) + ("^The y-coordinate [\\s\\S]*?\\*\\*0.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + ("[\\s\\S]*?\\*\\*0\\.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), +]) +def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None): + global server + # n_predict = 512 + server.n_slots = 1 + server.jinja = True + server.n_ctx = 8192 * 2 + server.n_predict = n_predict + server.model_hf_repo = hf_repo + server.model_hf_file = None + if isinstance(template_override, tuple): + (template_hf_repo, template_variant) = template_override + server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" + assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override + server.start(timeout_seconds=TIMEOUT_SERVER_START) + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": n_predict, + "messages": [ + {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."}, + {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_6789", + "type": "function", + "function": { + "name": "calculate", + "arguments": "{\"expression\":\"sin(30 * pi / 180)\"}" + } + } + ] + }, + { + "role": "tool", + "name": "calculate", + "content": 0.55644242476, + "tool_call_id": "call_6789" + } + ], + "tools": [ + { + "type":"function", + "function":{ + "name":"calculate", + "description":"A calculator function that computes values of arithmetic expressions in the Python syntax", + "parameters":{ + "type":"object", + "properties":{ + "expression":{ + "type":"string", + "description":"An arithmetic expression to compute the value of (Python syntad, assuming all floats)" + } + }, + "required":["expression"] + } + } + } + ] + }, timeout=TIMEOUT_HTTP_REQUEST) + assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" + choice = res.body["choices"][0] + tool_calls = choice["message"].get("tool_calls") + assert tool_calls is None, f'Expected no tool call in {choice["message"]}' + content = choice["message"].get("content") + assert content is not None, f'Expected content in {choice["message"]}' + if result_override is not None: + assert re.match(result_override, content), f'Expected {result_override}, got {content}' + else: + assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \ + f'Expected something like "The y coordinate is 0.56.", got {content}' + + +@pytest.mark.slow +@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [ + (128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + + (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'none', "\n?I need[\\s\\S]*?\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + + (1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), +]) +def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): + global server + server.n_slots = 1 + server.reasoning_format = reasoning_format + server.jinja = True + server.n_ctx = 8192 * 2 + server.n_predict = n_predict + server.model_hf_repo = hf_repo + server.model_hf_file = None + if isinstance(template_override, tuple): + (template_hf_repo, template_variant) = template_override + server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" + assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override + server.start(timeout_seconds=TIMEOUT_SERVER_START) + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": n_predict, + "messages": [ + {"role": "user", "content": "What's the sum of 102 and 7?"}, + ] + }, timeout=TIMEOUT_HTTP_REQUEST) + assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" + choice = res.body["choices"][0] + assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}' + + content = choice["message"].get("content") + if expect_content is None: + assert content is None, f'Expected no content in {choice["message"]}' + else: + assert re.match(expect_content, content), f'Expected {expect_content}, got {content}' + + reasoning_content = choice["message"].get("reasoning_content") + if expect_reasoning_content is None: + assert reasoning_content is None, f'Expected no reasoning content in {choice["message"]}' + else: + assert re.match(expect_reasoning_content, reasoning_content), f'Expected {expect_reasoning_content}, got {reasoning_content}' + + @pytest.mark.slow @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ + (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -371,15 +521,13 @@ def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None) # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), - - # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): +def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server server.n_slots = 1 server.jinja = True server.n_ctx = 8192 - server.n_predict = 128 + server.n_predict = 512 # High because of DeepSeek R1 server.model_hf_repo = hf_repo server.model_hf_file = None if isinstance(template_override, tuple): @@ -406,6 +554,7 @@ def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] + assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] actual_arguments = tool_call["function"]["arguments"] if expected_arguments_override is not None: diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index ce0680662..a82504235 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -78,6 +78,7 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None + reasoning_format: Literal['deepseek', 'none'] | None = None chat_template: str | None = None chat_template_file: str | None = None @@ -172,6 +173,8 @@ class ServerProcess: server_args.append("--no-webui") if self.jinja: server_args.append("--jinja") + if self.reasoning_format is not None: + server_args.extend(("--reasoning-format", self.reasoning_format)) if self.chat_template: server_args.extend(["--chat-template", self.chat_template]) if self.chat_template_file: diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 5f97df5fd..86de0e6d7 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -578,6 +578,7 @@ static json oaicompat_completion_params_parse(const json & body) { static json oaicompat_completion_params_parse( const json & body, /* openai api json semantics */ bool use_jinja, + common_reasoning_format reasoning_format, const common_chat_templates & chat_templates) { json llama_params; @@ -633,9 +634,10 @@ static json oaicompat_completion_params_parse( throw std::runtime_error("Cannot use custom grammar constraints with tools."); } common_chat_inputs inputs; - inputs.messages = body.at("messages"); - inputs.tools = tools; - inputs.tool_choice = tool_choice; + inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE; + inputs.messages = body.at("messages"); + inputs.tools = tools; + inputs.tool_choice = tool_choice; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n"); diff --git a/examples/server/webui/src/components/ChatMessage.tsx b/examples/server/webui/src/components/ChatMessage.tsx index 2ffe08b37..68be7c751 100644 --- a/examples/server/webui/src/components/ChatMessage.tsx +++ b/examples/server/webui/src/components/ChatMessage.tsx @@ -254,12 +254,12 @@ export default function ChatMessage({ 🔄 Regenerate )} - )} + )} diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a23abba27..65958a684 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -198,7 +198,7 @@ #ifndef __GNUC__ # define GGML_ATTRIBUTE_FORMAT(...) -#elif defined(__MINGW32__) +#elif defined(__MINGW32__) && !defined(__clang__) # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 06df10853..ffef2130b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -563,6 +563,41 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) { return __lasx_xvpickev_b(tmp1, tmp); } +static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvadd_h(tmp1, tmp2); +} + +static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvrepl128vei_h(a, 0); + case 1: return __lasx_xvrepl128vei_h(a, 1); + case 2: return __lasx_xvrepl128vei_h(a, 2); + case 3: return __lasx_xvrepl128vei_h(a, 3); + case 4: return __lasx_xvrepl128vei_h(a, 4); + case 5: return __lasx_xvrepl128vei_h(a, 5); + case 6: return __lasx_xvrepl128vei_h(a, 6); + case 7: return __lasx_xvrepl128vei_h(a, 7); + default: __builtin_unreachable(); + } +} + +static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvandi_b(a, 1 << 0); + case 1: return __lasx_xvandi_b(a, 1 << 1); + case 2: return __lasx_xvandi_b(a, 1 << 2); + case 3: return __lasx_xvandi_b(a, 1 << 3); + case 4: return __lasx_xvandi_b(a, 1 << 4); + case 5: return __lasx_xvandi_b(a, 1 << 5); + case 6: return __lasx_xvandi_b(a, 1 << 6); + case 7: return __lasx_xvandi_b(a, 1 << 7); + default: __builtin_unreachable(); + } +} + // multiply int8_t, add results pairwise twice static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { // Get absolute values of x vectors @@ -657,13 +692,8 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - - // Get absolute values of x vectors - const __m256i ax = __lasx_xvsigncov_b(x, x); - // Sign the values of the y vectors - const __m256i sy = __lasx_xvsigncov_b(x, y); - - return mul_sum_us8_pairs_float(ax, sy); + const __m256i dot = lasx_madd_h_b(x, y); + return sum_i16_pairs_float(dot); } static inline __m128i packNibbles( __m256i bytes ) { @@ -743,7 +773,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); } } -#elif defined(__wasm_simd128__) +#elif defined __wasm_simd128__ for (int i = 0; i < nb; i++) { v128_t srcv [8]; v128_t asrcv[8]; @@ -1031,7 +1061,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); } -#elif defined(__wasm_simd128__) +#elif defined __wasm_simd128__ for (int i = 0; i < nb; i++) { v128_t srcv [8]; v128_t asrcv[8]; @@ -1645,7 +1675,87 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1 //===================================== Q8_K ============================================== void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { +#ifdef __wasm_simd128__ + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + block_q8_K * restrict yc = y; // Cast to proper type + + for (int i = 0; i < nb; i++) { + const float * x_block = x + i * QK_K; + + v128_t min_vec = wasm_v128_load(x_block); + v128_t max_vec = min_vec; + + for (int j = 4; j < QK_K; j += 4) { + v128_t x_vec = wasm_v128_load(x_block + j); + max_vec = wasm_f32x4_pmax(max_vec, x_vec); + min_vec = wasm_f32x4_pmin(min_vec, x_vec); + } + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1)); + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2)); + float max = wasm_f32x4_extract_lane(max_vec, 0); + float min = wasm_f32x4_extract_lane(min_vec, 0); + float amax = -min > max ? min : max; + + if (amax == 0.0f) { + yc[i].d = 0.0f; + const v128_t zero = wasm_i8x16_splat(0); + for (int j = 0; j < QK_K; j += 16) { + wasm_v128_store(yc[i].qs + j, zero); + } + continue; + } + + const float iscale = -127.0f / amax; + const v128_t scale_vec = wasm_f32x4_splat(iscale); + + // Process 16 elements per iteration + for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) { + // Load and quantize 16 floats + v128_t x0 = wasm_v128_load(x_block + j); + v128_t x1 = wasm_v128_load(x_block + j + 4); + v128_t x2 = wasm_v128_load(x_block + j + 8); + v128_t x3 = wasm_v128_load(x_block + j + 12); + + v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec)); + v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec)); + v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec)); + v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec)); + + // Convert to i32 with saturation + v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0); + v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1); + v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2); + v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3); + + // Pack into 16 i8 values + v128_t i8 = wasm_i8x16_narrow_i16x8( + wasm_i16x8_narrow_i32x4(i0, i1), + wasm_i16x8_narrow_i32x4(i2, i3) + ); + wasm_v128_store(yc[i].qs + j, i8); + + // Calculate bsums using SIMD + v128_t sum16 = wasm_i16x8_add( + wasm_i16x8_extend_low_i8x16(i8), + wasm_i16x8_extend_high_i8x16(i8) + ); + v128_t sum32 = wasm_i32x4_add( + wasm_i32x4_extend_low_i16x8(sum16), + wasm_i32x4_extend_high_i16x8(sum16) + ); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1)); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2)); + yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0); + } + + yc[i].d = 1.0f / iscale; + } +#else quantize_row_q8_K_ref(x, y, k); +#endif } //===================================== Dot products ================================= @@ -2003,6 +2113,94 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s8b = wasm_i8x16_splat(0x8); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // Load and process x0 + v128_t v0_0 = wasm_v128_load(x0->qs); + v128_t v0_0l = wasm_v128_and(v0_0, m4b); + v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); + v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); + v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); + + // Load y0 vectors + v128_t y0_l = wasm_v128_load(y0->qs); + v128_t y0_h = wasm_v128_load(y0->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls); + v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls); + v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs); + v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs); + + v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l); + v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l); + v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h); + v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h); + + v128_t dp0 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0l, dy0ll), + wasm_i32x4_dot_i16x8(dx0h, dy0lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0hl, dy0hl), + wasm_i32x4_dot_i16x8(dx0hh, dy0hh) + ) + ); + + // Load and process x1 + v128_t v0_1 = wasm_v128_load(x1->qs); + v128_t v0_1l = wasm_v128_and(v0_1, m4b); + v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); + v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); + v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); + + // Load y1 vectors + v128_t y1_l = wasm_v128_load(y1->qs); + v128_t y1_h = wasm_v128_load(y1->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls); + v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls); + v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs); + v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs); + + v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l); + v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l); + v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h); + v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h); + + v128_t dp1 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1l, dy1ll), + wasm_i32x4_dot_i16x8(dx1h, dy1lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1hl, dy1hl), + wasm_i32x4_dot_i16x8(dx1hh, dy1hh) + ) + ); + + // Accumulate results with scaling + float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); + float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d); + + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -2689,10 +2887,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__wasm_simd128__) +#elif defined __wasm_simd128__ v128_t sumv = wasm_f32x4_splat(0.0f); - uint32_t qh; + uint32_t qh_; uint64_t tmp[4]; // TODO: check if unrolling this is better @@ -2703,12 +2901,12 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r const v128_t m4b = wasm_i8x16_splat(0x0F); // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); + memcpy(&qh_, x0->qh, sizeof(qh_)); - tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh >> 24) ]; + tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh_ >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); @@ -3050,12 +3248,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined(__wasm_simd128__) +#elif defined __wasm_simd128__ v128_t sumv = wasm_f32x4_splat(0.0f); float summs = 0.0f; - uint32_t qh; + uint32_t qh_; uint64_t tmp[4]; // TODO: check if unrolling this is better @@ -3068,12 +3266,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r const v128_t m4b = wasm_i8x16_splat(0x0F); // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); + memcpy(&qh_, x0->qh, sizeof(qh_)); - tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh >> 24) ]; + tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh_ >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); @@ -3566,6 +3764,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + for (; ib < nb; ++ib) { + const block_q8_0 * restrict x0 = &x[ib]; + const block_q8_0 * restrict y0 = &y[ib]; + + const v128_t x0_0 = wasm_v128_load(x0->qs); + const v128_t x0_1 = wasm_v128_load(x0->qs + 16); + const v128_t y0_0 = wasm_v128_load(y0->qs); + const v128_t y0_1 = wasm_v128_load(y0->qs + 16); + + // Extend 8-bit to 16-bit + const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0); + const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0); + const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1); + const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1); + + const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0); + const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0); + const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1); + const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1); + + // Compute dot products + const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l); + const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h); + const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l); + const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h); + + // Sum all dot products + const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); + + // Convert to float and accumulate + const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -4440,6 +4677,106 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = hsum_float_8(acc); +#elif defined __wasm_simd128__ + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + // Vectorized summs calculation + v128_t summs_vec = wasm_i32x4_splat(0); + { + v128_t sc_vec = wasm_v128_load(sc); + v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); + + v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper); + v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper); + + v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]); + v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]); + + summs_vec = wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1), + wasm_i32x4_dot_i16x8(sc_high, bsums2)), + summs_vec + ); + + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1)); + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2)); + } + int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0); + + // Vectorized isum calculation + int32_t isum = 0; + const uint8_t * sc_ptr = sc; + const int k_iters = QK_K/128; + + for (int k = 0; k < k_iters; ++k) { + v128_t isum_vec = wasm_i32x4_splat(0); + int shift = 0; + + for (int j = 0; j < 4; ++j) { + const int d0 = (sc_ptr[0] & 0xF); + const int d1 = (sc_ptr[1] & 0xF); + sc_ptr += 2; + + // Process first 16 elements + v128_t q2_0 = wasm_v128_load(q2); + v128_t q8_0 = wasm_v128_load(q8); + v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift); + v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03)); + + // Process next 16 elements + v128_t q2_1 = wasm_v128_load(q2 + 16); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift); + v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03)); + + // Calculate dot products + v128_t p0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_0), + wasm_i16x8_extend_low_i8x16(q2_bits_0) + ); + v128_t p1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_0), + wasm_i16x8_extend_high_i8x16(q2_bits_0) + ); + v128_t p2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_1), + wasm_i16x8_extend_low_i8x16(q2_bits_1) + ); + v128_t p3 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_1), + wasm_i16x8_extend_high_i8x16(q2_bits_1) + ); + + // Accumulate scaled results + v128_t scaled = wasm_i32x4_add( + wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)), + wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1)) + ); + + isum_vec = wasm_i32x4_add(isum_vec, scaled); + q8 += 32; + shift += 2; + } + q2 += 32; + + // Horizontal sum of isum_vec + isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1)); + isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2)); + isum += wasm_i32x4_extract_lane(isum_vec, 0); + } + + const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf += dall * isum - dmin * summs; + } + + *s = sumf; + #elif defined __riscv_v_intrinsic float sumf = 0; @@ -4659,9 +4996,6 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined __loongarch_asx - const __m256i m3 = __lasx_xvreplgr2vr_b(3); - const __m128i m4 = __lsx_vreplgr2vr_b(0xF); - __m256 acc = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { @@ -4672,18 +5006,15 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0); - const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4); - const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4); - const __m256i mins = lasx_ext8_16(mins8); + const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); + const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4)); const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); - const __m256i all_scales = lasx_ext8_16(scales8); - const __m128i l_scales = lasx_extracti128(all_scales, 0); - const __m128i h_scales = lasx_extracti128(all_scales, 1); - const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)}; + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); __m256i sumi = __lasx_xvldi(0); @@ -4696,20 +5027,20 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q2_0 = __lasx_xvand_v(q2bits, m3); - const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3); - const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3); - const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3); + const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3); + const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3); + const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3); + const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6); - __m256i p0 = lasx_maddubs_h(q2_0, q8_0); - __m256i p1 = lasx_maddubs_h(q2_1, q8_1); - __m256i p2 = lasx_maddubs_h(q2_2, q8_2); - __m256i p3 = lasx_maddubs_h(q2_3, q8_3); + __m256i p0 = lasx_madd_h_b(q2_0, q8_0); + __m256i p1 = lasx_madd_h_b(q2_1, q8_1); + __m256i p2 = lasx_madd_h_b(q2_2, q8_2); + __m256i p3 = lasx_madd_h_b(q2_3, q8_3); - p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0); - p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1); - p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2); - p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3); + p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0); + p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1); + p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2); + p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3); p0 = __lasx_xvadd_w(p0, p1); p2 = __lasx_xvadd_w(p2, p3); @@ -5122,6 +5453,94 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = hsum_float_8(acc); +#elif defined __wasm_simd128__ + int8_t aux8[QK_K]; + float sums[8] = {0}; + uint32_t auxs[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + // Process blocks with SIMD + int8_t * a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int shift = 0; shift <= 6; shift += 2) { + v128_t v_m = wasm_i8x16_splat(m); + for (int l = 0; l < 32; l += 16) { + v128_t v_q3 = wasm_v128_load(q3 + l); + v128_t v_shift = wasm_i8x16_shr(v_q3, shift); + v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03)); + + v128_t v_hm = wasm_v128_load(hm + l); + v128_t v_mask = wasm_v128_and(v_hm, v_m); + v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); + + v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask))); + wasm_v128_store(a + l, v_low2); + } + a += 32; + m <<= 1; + } + q3 += 32; + } + + // Extract scales + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const int8_t * scales = (const int8_t *)auxs; + + // SIMD dot product with register accumulators + v128_t v_acc0 = wasm_i32x4_splat(0); + v128_t v_acc1 = wasm_i32x4_splat(0); + a = aux8; + for (int j = 0; j < QK_K/16; ++j) { + const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); + + // Process 16 elements per iteration + for (int k = 0; k < 2; ++k) { + const v128_t v_q8 = wasm_i16x8_load8x8(q8); + const v128_t v_a = wasm_i16x8_load8x8(a); + + v128_t v_prod = wasm_i16x8_mul(v_q8, v_a); + v_prod = wasm_i16x8_mul(v_prod, v_scale); + + v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod)); + v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod)); + + q8 += 8; + a += 8; + } + } + + // Accumulate results + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const v128_t v_d = wasm_f32x4_splat(d); + v128_t v_sum = wasm_f32x4_add( + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d) + ); + + // Accumulate into sums vector + wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum)); + } + + // Horizontal sum + v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4)); + sumf = wasm_f32x4_extract_lane(v_sum, 0) + + wasm_f32x4_extract_lane(v_sum, 1) + + wasm_f32x4_extract_lane(v_sum, 2) + + wasm_f32x4_extract_lane(v_sum, 3); + + *s = sumf; + #elif defined __riscv_v_intrinsic uint32_t aux[3]; @@ -5377,8 +5796,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined __loongarch_asx - const __m256i m3 = __lasx_xvreplgr2vr_b(3); - const __m256i mone = __lasx_xvreplgr2vr_b(1); const __m128i m32 = __lsx_vreplgr2vr_b(32); __m256 acc = (__m256)__lasx_xvldi(0); @@ -5398,10 +5815,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); scales128 = __lsx_vsub_b(scales128, m32); - const __m256i all_scales = lasx_ext8_16(scales128); - const __m128i l_scales = lasx_extracti128(all_scales, 0); - const __m128i h_scales = lasx_extracti128(all_scales, 1); - const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)}; + + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); // high bit const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); @@ -5409,35 +5825,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r // integer accumulator __m256i sumi = __lasx_xvldi(0); - int bit = 0; - int is = 0; - __m256i xvbit; - - for (int j = 0; j < QK_K/128; ++j) { // load low 2 bits const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; - xvbit = __lasx_xvreplgr2vr_h(bit); // prepare low and high bits - const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3); - const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2); - ++bit; - - xvbit = __lasx_xvreplgr2vr_h(bit); - const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3); - const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2); - ++bit; - - xvbit = __lasx_xvreplgr2vr_h(bit); - const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3); - const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2); - ++bit; - - xvbit = __lasx_xvreplgr2vr_h(bit); - const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3); - const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2); - ++bit; + const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3); + const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3); + const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3); + const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6); + const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2); + const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2); + const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2); + const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2); + const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0); + const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1); + const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2); + const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3); // load Q8 quants const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; @@ -5445,29 +5849,16 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0); - __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1); - __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2); - __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3); - - __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0); - __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1); - __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2); - __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3); - - p16_0 = __lasx_xvsub_h(p16_0, q8s_0); - p16_1 = __lasx_xvsub_h(p16_1, q8s_1); - p16_2 = __lasx_xvsub_h(p16_2, q8s_2); - p16_3 = __lasx_xvsub_h(p16_3, q8s_3); + __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0); + __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1); + __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2); + __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3); // multiply with scales - p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); - p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); - p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); - p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); + p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); // accumulate p16_0 = __lasx_xvadd_w(p16_0, p16_1); @@ -5475,7 +5866,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); } // multiply with block scale and accumulate - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); } *s = hsum_float_8(acc); @@ -5647,7 +6038,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r } } *s = sumf; -#elif __ARM_NEON +#elif defined __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); const int32x4_t mzero = vdupq_n_s32(0); @@ -5710,6 +6101,107 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = sumf; +#elif defined __wasm_simd128__ + const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi = 0; + const int16_t * restrict q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + // Load 64 4-bit weights (32 bytes) + const v128_t q4x0 = wasm_v128_load(q4); + const v128_t q4x1 = wasm_v128_load(q4 + 16); + q4 += 32; + + // Split into low/high nibbles + const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F)); + const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4); + const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F)); + const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4); + + // Load 64 8-bit values (64 bytes) + const v128_t q8x0 = wasm_v128_load(q8); + const v128_t q8x1 = wasm_v128_load(q8 + 16); + const v128_t q8x2 = wasm_v128_load(q8 + 32); + const v128_t q8x3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Low nibble products + v128_t vacc1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l0), + wasm_i16x8_extend_low_i8x16(q8x0) + ); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4l0), + wasm_i16x8_extend_high_i8x16(q8x0) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l1), + wasm_i16x8_extend_low_i8x16(q8x1) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4l1), + wasm_i16x8_extend_high_i8x16(q8x1) + )); + + // High nibble products + v128_t vacc2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h0), + wasm_i16x8_extend_low_i8x16(q8x2) + ); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h0), + wasm_i16x8_extend_high_i8x16(q8x2) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h1), + wasm_i16x8_extend_low_i8x16(q8x3) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h1), + wasm_i16x8_extend_high_i8x16(q8x3) + )); + + // Accumulate scaled results + int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) + + wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3); + sumi1 += vacc1_sum * scales[2*j]; + + int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) + + wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3); + sumi2 += vacc2_sum * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + } + + *s = sumf; + #elif defined __AVX2__ const __m256i m4 = _mm256_set1_epi8(0xF); @@ -6067,11 +6559,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = vec_extract(vsumf0, 0); #elif defined __loongarch_asx - GGML_UNUSED(kmask1); - GGML_UNUSED(kmask2); - GGML_UNUSED(kmask3); - - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); __m256 acc = (__m256)__lasx_xvldi(0); __m128 acc_m = (__m128)__lsx_vldi(0); @@ -6091,33 +6578,34 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); - const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s); + const __m128i prod = lsx_madd_h(mins128, q8s); acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); - const __m128i sc128 = lasx_extracti128(mins_and_scales, 0); - const __m256i scales = lasx_insertf128(sc128, sc128); + const __m256i scales = lasx_insertf128(scales128, scales128); __m256i sumi = __lasx_xvldi(0); for (int j = 0; j < QK_K/64; ++j) { - const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0)); - const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1)); + const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4l = __lasx_xvand_v(q4bits, m4); - const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4); + const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); + const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16l = lasx_maddubs_h(q4l, q8l); + __m256i p16l = lasx_madd_h_b(q4l, q8l); p16l = lasx_madd_h(scale_l, p16l); const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16h = lasx_maddubs_h(q4h, q8h); + __m256i p16h = lasx_madd_h_b(q4h, q8h); p16h = lasx_madd_h(scale_h, p16h); const __m256i sumj = __lasx_xvadd_w(p16l, p16h); @@ -6460,6 +6948,118 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = hsum_float_8(acc) + summs; +#elif defined __wasm_simd128__ + //const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi_mins = 0; + const int16_t * restrict q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi_mins; // Correct subtraction + + v128_t qh0 = wasm_v128_load(qh); + v128_t qh1 = wasm_v128_load(qh + 16); + const uint8_t * sc = (const uint8_t *)utmp; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const int shift = j * 2; + v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift); + v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift); + + v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3); + v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3); + + v128_t q5_0 = wasm_v128_load(q5); + v128_t q5_1 = wasm_v128_load(q5 + 16); + q5 += 32; + + v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0); + v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0); + v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1); + v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1); + + v128_t q8_0 = wasm_v128_load(q8); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q8_2 = wasm_v128_load(q8 + 32); + v128_t q8_3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Process low quants + v128_t pl0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_0), + wasm_i16x8_extend_low_i8x16(q8_0) + ); + pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_0), + wasm_i16x8_extend_high_i8x16(q8_0) + )); + v128_t pl1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_1), + wasm_i16x8_extend_low_i8x16(q8_1) + ); + pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_1), + wasm_i16x8_extend_high_i8x16(q8_1) + )); + v128_t sum_low = wasm_i32x4_add(pl0, pl1); + + // Process high quants + v128_t ph0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_0), + wasm_i16x8_extend_low_i8x16(q8_2) + ); + ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_0), + wasm_i16x8_extend_high_i8x16(q8_2) + )); + v128_t ph1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_1), + wasm_i16x8_extend_low_i8x16(q8_3) + ); + ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_1), + wasm_i16x8_extend_high_i8x16(q8_3) + )); + v128_t sum_high = wasm_i32x4_add(ph0, ph1); + + // Accumulate with scale factors + int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) + + wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3); + int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) + + wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3); + + sumi += sl * sc[2*j] + sh * sc[2*j+1]; + } + + sumf += d * sumi; + } + + *s = sumf; + #elif defined __riscv_v_intrinsic const uint8_t * scales = (const uint8_t*)&utmp[0]; @@ -6682,19 +7282,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = vec_extract(vsumf0, 0); #elif defined __loongarch_asx - GGML_UNUSED(kmask1); - GGML_UNUSED(kmask2); - GGML_UNUSED(kmask3); - - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); - const __m128i mzero = __lsx_vldi(0); - const __m256i mone = __lasx_xvreplgr2vr_b(1); __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); - float summs = 0.f; - - for (int i = 0; i < nb; ++i) { + for (int i = 0; i < nb; ++i) { const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6709,49 +7301,40 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); - const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s); - const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero); - summs += dmin * __lsx_vpickve2gr_w(hsum, 0); //TODO check + const __m128i prod = lsx_madd_h(mins128, q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); - const __m128i sc128 = lasx_extracti128(mins_and_scales, 0); - const __m256i scales = lasx_insertf128(sc128, sc128); + const __m256i scales = lasx_insertf128(scales128, scales128); const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0); - __m256i hmask = mone; __m256i sumi = __lasx_xvldi(0); - int bit = 0; - __m256i xvbit; - for (int j = 0; j < QK_K/64; ++j) { - const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0)); - const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1)); + const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1); const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32; - xvbit = __lasx_xvreplgr2vr_h(bit++); - const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4); - const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4); - const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0); - hmask = __lasx_xvslli_h(hmask, 1); - - xvbit = __lasx_xvreplgr2vr_h(bit++); - const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4); - const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4); - const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1); - hmask = __lasx_xvslli_h(hmask, 1); + const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf); + const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4); + const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef); + const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef); + const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0); + const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1); const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0); - __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1); + __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0); + __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1); p16_0 = lasx_madd_h(scale_0, p16_0); p16_1 = lasx_madd_h(scale_1, p16_1); @@ -6765,7 +7348,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r } - *s = hsum_float_8(acc) + summs; + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8)); + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4)); + + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; #else @@ -7123,6 +7709,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = hsum_float_8(acc); +#elif defined __wasm_simd128__ + int8_t aux8[QK_K] __attribute__((aligned(16))); + int32_t aux32[8] __attribute__((aligned(16))) = {0}; + float sums[8] __attribute__((aligned(16))) = {0}; + + for (int i = 0; i < nb; ++i) { + // Unpack 6-bit quantized data into aux8 (unchanged) + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + int8_t * a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + + const int8_t * restrict a_ptr = aux8; + const int8_t * restrict q8 = y[i].qs; + v128_t acc0 = wasm_i32x4_splat(0); + v128_t acc1 = wasm_i32x4_splat(0); + + for (int j = 0; j < QK_K/16; ++j) { + const int scale = x[i].scales[j]; + const v128_t vscale = wasm_i32x4_splat(scale); + + // Load 16 elements from a and q8 + const v128_t a_vec = wasm_v128_load(a_ptr); + const v128_t q8_vec = wasm_v128_load(q8); + + // Process low 8 elements + v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec); + v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec); + v128_t prod_low = wasm_i16x8_mul(a_low, q8_low); + v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low); + v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low); + + // Process high 8 elements + v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec); + v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec); + v128_t prod_high = wasm_i16x8_mul(a_high, q8_high); + v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high); + v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high); + + // Scale and accumulate + prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale); + prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale); + prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale); + prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale); + + acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo)); + acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi)); + + a_ptr += 16; + q8 += 16; + } + + // Store accumulated results + wasm_v128_store(&aux32[0], acc0); + wasm_v128_store(&aux32[4], acc1); + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) { + sums[l] += d * aux32[l]; + } + } + + // Sum final results + float sumf = 0; + for (int l = 0; l < 8; ++l) { + sumf += sums[l]; + } + *s = sumf; + #elif defined __riscv_v_intrinsic float sumf = 0; @@ -7347,8 +8012,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined __loongarch_asx - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); - const __m256i m2 = __lasx_xvreplgr2vr_b(3); const __m256i m32s = __lasx_xvreplgr2vr_b(32); __m256 acc = (__m256)__lasx_xvldi(0); @@ -7361,58 +8024,42 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; - const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0); + const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); __m256i sumi = __lasx_xvldi(0); - int is = 0; - for (int j = 0; j < QK_K/128; ++j) { - const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0)); - const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1)); - const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2)); - const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3)); - is += 4; - const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; - const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4); - const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4); - const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4); - const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4); + const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4); + const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2); + const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4); + const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2); - const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0); - const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1); - const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2); - const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3); + const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0); + const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1); + const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2); + const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3); const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0); - __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1); - __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2); - __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3); + __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0); + __m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1); + __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2); + __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3); - __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0); - __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1); - __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2); - __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3); - - p16_0 = __lasx_xvsub_h(p16_0, q8s_0); - p16_1 = __lasx_xvsub_h(p16_1, q8s_1); - p16_2 = __lasx_xvsub_h(p16_2, q8s_2); - p16_3 = __lasx_xvsub_h(p16_3, q8s_3); - - p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0); - p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1); - p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2); - p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3); + p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); @@ -9737,13 +10384,9 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #elif defined(__loongarch_asx) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i ax = __lasx_xvsigncov_b(x, x); - const __m256i sy = __lasx_xvsigncov_b(x, y); - __m256i tmp1, tmp2, tmp3; - tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy); - tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy); - tmp3 = __lasx_xvadd_h(tmp1, tmp2); - return __lasx_xvsat_h(tmp3, 15); + const __m256i a = __lasx_xvmulwev_h_b(x, y); + const __m256i b = __lasx_xvmulwod_h_b(x, y); + return __lasx_xvadd_h(a, b); } #endif @@ -10793,67 +11436,31 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * #elif defined(__loongarch_asx) const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); - const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); __m256 accum = (__m256)__lasx_xvldi(0); - __m256i tmp1; - __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask; - mask_8f = __lsx_vreplgr2vr_b(0x8f); for (int ibl = 0; ibl < nb; ++ibl) { const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; uint16_t sh = x[ibl].scales_h; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); - __m128i zero = __lsx_vldi(0); for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; - const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f); - tmp0 = __lsx_vori_b(tmp2, 0x10); - mask = __lsx_vsle_b(zero, tmp2); - tmp3 = __lsx_vand_v(tmp0, mask); - tmp3 = __lsx_vshuf_b(values128, zero, tmp3); - - tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f); - tmp0 = __lsx_vori_b(tmp2, 0x10); - mask = __lsx_vsle_b(zero, tmp2); - tmp4 = __lsx_vand_v(tmp0, mask); - tmp4 = __lsx_vshuf_b(values128, zero, tmp4); - - const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4); - - tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f); - tmp0 = __lsx_vori_b(tmp2, 0x10); - mask = __lsx_vsle_b(zero, tmp2); - tmp3 = __lsx_vand_v(tmp0, mask); - tmp3 = __lsx_vshuf_b(values128, zero, tmp3); - - tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f); - tmp0 = __lsx_vori_b(tmp2, 0x10); - mask = __lsx_vsle_b(zero, tmp2); - tmp4 = __lsx_vand_v(tmp0, mask); - tmp4 = __lsx_vshuf_b(values128, zero, tmp4); - - const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4); - + const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf))); + const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf))); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; sh >>= 4; - __m256i tmp5, tmp6; - tmp1 = __lasx_xvreplgr2vr_h(ls1); - tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1); - tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1); - const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6); - tmp1 = __lasx_xvreplgr2vr_h(ls2); - tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1); - tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1); - const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6); + const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1)); + const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2)); sumi1 = __lasx_xvadd_w(p_1, sumi1); sumi2 = __lasx_xvadd_w(p_2, sumi2); } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 9b7680ca2..d5dd6a729 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7,7 +7,6 @@ #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include "ggml-impl.h" -#include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" // #include "amx/amx.h" @@ -1295,7 +1294,7 @@ struct ggml_threadpool { atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int GGML_CACHE_ALIGN n_barrier; atomic_int GGML_CACHE_ALIGN n_barrier_passed; - atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. + atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. // these are atomic as an annotation for thread-sanitizer atomic_bool stop; // Used for stopping the threadpool altogether @@ -7528,6 +7527,7 @@ UseGgmlGemm1:; if (src1->type != vec_dot_type) { char * wdata = params->wdata; + const size_t nbw0 = ggml_type_size(vec_dot_type); const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; @@ -7535,6 +7535,7 @@ UseGgmlGemm1:; assert(params->wsize >= ne13*nbw3); GGML_ASSERT(src1->type == GGML_TYPE_F32); + #if 0 for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = ith; i11 < ne11; i11 += nth) { @@ -7544,6 +7545,20 @@ UseGgmlGemm1:; } } } + #else + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + size_t bs = ggml_blck_size(vec_dot_type); + int64_t ne10_block_start = (ith * ne10/bs) / nth; + int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), + (ne10_block_end - ne10_block_start) * bs); + } + } + } + #endif } if (ith == 0) { @@ -7631,7 +7646,6 @@ UseGgmlGemm2:; if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { num_rows_per_vec_dot = 1; } - ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); if (nth >= nchunk0 * nchunk1) { @@ -7644,6 +7658,84 @@ UseGgmlGemm2:; // ggml_compute_forward_mul_mat_id +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)] + +struct mmid_row_mapping { + int32_t i1; + int32_t i2; +}; + +static void ggml_compute_forward_mul_mat_id_one_chunk( + struct ggml_tensor * dst, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * ids, + const int64_t cur_a, + const int64_t ir0_start, + const int64_t ir0_end, + const int64_t ir1_start, + const int64_t ir1_end, + const char * src0_cur, + const struct mmid_row_mapping * matrix_rows, + const size_t row_size, + const bool src1_cont, + const void * wdata) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + float tmp[16]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) { + const int64_t _i12 = ir1; // logical row index for this expert + + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11)*row_size + : (i11*nb11 + i12*nb12)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); + } + + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float)); + } + } + } +} + +static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { + + void * ptr = *p; + ptr = (void *) GGML_PAD((uintptr_t) ptr, align); + *p = (void *) ((char *) ptr + size); + return ptr; +} + static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -7661,7 +7753,6 @@ static void ggml_compute_forward_mul_mat_id( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; @@ -7679,21 +7770,27 @@ static void ggml_compute_forward_mul_mat_id( const int n_ids = ids->ne[0]; // n_expert_used const int n_as = ne02; // n_expert - char * wdata_src1_end = (src1->type == vec_dot_type) ? - (char *) params->wdata : - (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); + void * wdata_cur = params->wdata; - struct mmid_row_mapping { - int32_t i1; - int32_t i2; - }; + if (src1->type != vec_dot_type) { + incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); + } - int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] + int64_t * matrix_row_counts = // [n_as] + incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t)); + + struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]] + incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t)); + + char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as] + incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE); + + GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata)); if (src1->type != vec_dot_type) { char * wdata = params->wdata; + const size_t nbw0 = ggml_type_size(vec_dot_type); const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; @@ -7701,19 +7798,32 @@ static void ggml_compute_forward_mul_mat_id( assert(params->wsize >= ne13*nbw3); GGML_ASSERT(src1->type == GGML_TYPE_F32); +#if 0 for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + for (int64_t i12 = ith; i12 < ne12; i12 += nth) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), ne10); } } } +#else + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + size_t bs = ggml_blck_size(vec_dot_type); + int64_t ne10_block_start = (ith * ne10/bs) / nth; + int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), + (ne10_block_end - ne10_block_start) * bs); + } + } + } +#endif } -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] - if (ith == 0) { // initialize matrix_row_counts memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); @@ -7731,9 +7841,14 @@ static void ggml_compute_forward_mul_mat_id( } } + // reset current_chunk + for (int cur_a = ith; cur_a < n_as; cur_a += nth) { + atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a); + *current_chunk_ctr = nth; + } + ggml_barrier(params->threadpool); - // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { const int64_t cne1 = matrix_row_counts[cur_a]; @@ -7741,84 +7856,64 @@ static void ggml_compute_forward_mul_mat_id( continue; } - const char * src0_cur = (const char *) src0->data + cur_a*nb02; - - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const char * src0_cur = (const char *) src0->data + cur_a * nb02; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1; // src1 rows + const int64_t nr0 = ne01; + const int64_t nr1 = cne1; - // distribute the thread work across the inner or outer loop based on which one is larger + int chunk_size = 16; + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } - const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows - const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows +#if defined(__aarch64__) + // disable for ARM + const bool disable_chunking = true; +#else + // disable for NUMA + const bool disable_chunking = ggml_is_numa(); +#endif // defined(__aarch64__) - const int64_t ith0 = ith % nth0; - const int64_t ith1 = ith / nth0; + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - const int64_t dr0 = (nr0 + nth0 - 1)/nth0; - const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) { + nchunk0 = nr0 > nr1 ? nth : 1; + nchunk1 = nr0 > nr1 ? 1 : nth; + } - const int64_t ir010 = dr0*ith0; - const int64_t ir011 = MIN(ir010 + dr0, nr0); + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - const int64_t ir110 = dr1*ith1; - const int64_t ir111 = MIN(ir110 + dr1, nr1); + int current_chunk = ith; - // threads with no work simply yield (not sure if it helps) - //if (ir010 >= ir011 || ir110 >= ir111) { - // sched_yield(); - // continue; - //} + atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a); - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; - // attempt to reduce false-sharing (does not seem to make a difference) - float tmp[16]; + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { - for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t _i12 = ir1; // logical row index for this expert + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); - const int id = row_mapping.i1; // selected expert index + ggml_compute_forward_mul_mat_id_one_chunk( + dst, src0, src1, ids, cur_a, + ir0_start, ir0_end, ir1_start, ir1_end, + src0_cur, matrix_rows, row_size, src1_cont, wdata + ); - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11)*row_size - : (i11*nb11 + i12*nb12)); - - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); - - //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); - } - - memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); - } + if (nth >= nchunk0 * nchunk1) { + break; } + + current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed); } } - -#undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -9112,10 +9207,6 @@ static void ggml_compute_forward_clamp_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - float min; float max; memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); @@ -13761,14 +13852,19 @@ struct ggml_cplan ggml_graph_plan( cur = 0; const struct ggml_tensor * src0 = node->src[0]; const struct ggml_tensor * src1 = node->src[1]; + const struct ggml_tensor * ids = node->src[2]; const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); - } const int n_as = src0->ne[2]; - cur += GGML_PAD(cur, sizeof(int64_t)); // align - cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows + // src1 + if (src1->type != vec_dot_type) { + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t); + } + // matrix_row_counts + cur += n_as * sizeof(int64_t) + sizeof(int64_t); + // matrix_rows + cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t); + // atomic_current_chunk + cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE; } break; case GGML_OP_OUT_PROD: { diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index c22a66287..e0482c593 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -280,14 +280,6 @@ template <> inline __m256bh load(const float *p) { } #endif -//////////////////////////////////////////////////////////////////////////////////////////////////// -// CONSTANTS - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); -#endif - //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -614,6 +606,14 @@ class tinyBLAS_Q0_AVX { TC *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + const int8_t kvalues_iq4nl[16] = { + -127, -104, -83, -65, + -49, -35, -22, -10, + 1, 13, 25, 38, + 53, 69, 89, 113 + }; + + iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); } void matmul(int64_t m, int64_t n) { @@ -1038,6 +1038,7 @@ class tinyBLAS_Q0_AVX { const int64_t ldc; const int ith; const int nth; + __m128i iq4nlt; }; #endif // __AVX__ diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8e3f0fc19..b99ff1f71 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -180,11 +180,11 @@ static ggml_cuda_device_info ggml_cuda_init() { int major_version = 0; size_t version_length = 0; if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) { - std::string version(version_length, '\0'); + std::vector version(version_length+1, '\0'); if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) { - version.resize(::strlen(version.c_str())); + version.resize(::strlen(version.data())); int parsed_value = 0; - if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) { + if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) { major_version = parsed_value; } } @@ -1481,12 +1481,7 @@ static void ggml_cuda_op_mul_mat( const size_t nbytes_data = ggml_nbytes(src0); const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding); - // TODO: remove this for MUSA once the Guilty Lockup issue is resolved -#ifndef GGML_USE_MUSA CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream)); -#else // GGML_USE_MUSA - CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream)); -#endif // !GGML_USE_MUSA } // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared: diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 40b6869f1..4f78e113e 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -151,5 +151,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } - return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 898ccb0ca..170d73f04 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1434,6 +1434,7 @@ static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); // some shaders have a minimum subgroup size + const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u); const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u); @@ -1496,13 +1497,13 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; diff --git a/include/llama.h b/include/llama.h index 5323f2ba4..6a38c9c23 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1174,6 +1174,9 @@ extern "C" { /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 + LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. diff --git a/models/templates/README.md b/models/templates/README.md new file mode 100644 index 000000000..72c30d1e1 --- /dev/null +++ b/models/templates/README.md @@ -0,0 +1,22 @@ +These templates can be updated with the following commands: + +```bash +./scripts/get_chat_template.py CohereForAI/c4ai-command-r-plus tool_use > models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +./scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 default > models/templates/CohereForAI-c4ai-command-r7b-12-2024-default.jinja +./scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 rag > models/templates/CohereForAI-c4ai-command-r7b-12-2024-rag.jinja +./scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use > models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +./scripts/get_chat_template.py deepseek-ai/DeepSeek-R1-Distill-Llama-8B > models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +./scripts/get_chat_template.py deepseek-ai/DeepSeek-R1-Distill-Qwen-32B > models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +./scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 > models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +./scripts/get_chat_template.py google/gemma-2-2b-it > models/templates/google-gemma-2-2b-it.jinja +./scripts/get_chat_template.py meetkai/functionary-medium-v3. > models/templates/meetkai-functionary-medium-v3.jinja +./scripts/get_chat_template.py meetkai/functionary-medium-v3.2 > models/templates/meetkai-functionary-medium-v3.2.jinja +./scripts/get_chat_template.py meta-llama/Llama-3.1-8B-Instruct > models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +./scripts/get_chat_template.py meta-llama/Llama-3.2-3B-Instruct > models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +./scripts/get_chat_template.py meta-llama/Llama-3.3-70B-Instruct > models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +./scripts/get_chat_template.py microsoft/Phi-3.5-mini-instruct > models/templates/microsoft-Phi-3.5-mini-instruct.jinja +./scripts/get_chat_template.py mistralai/Mistral-Nemo-Instruct-2407 > models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +``` \ No newline at end of file diff --git a/models/templates/llama-cpp-deepseek-r1.jinja b/models/templates/llama-cpp-deepseek-r1.jinja new file mode 100644 index 000000000..fcb1732eb --- /dev/null +++ b/models/templates/llama-cpp-deepseek-r1.jinja @@ -0,0 +1,76 @@ +{%- if not add_generation_prompt is defined -%} + {%- set add_generation_prompt = false -%} +{%- endif -%} +{%- set ns = namespace(is_first=false, is_tool_outputs=false, is_output_first=true, system_prompt='') -%} +{%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {%- set ns.system_prompt = message['content'] -%} + {%- endif -%} +{%- endfor -%} +{{bos_token}} +{%- if tools %} +You can call any of the following function tools to satisfy the user's requests: {{tools | map(attribute='function') | tojson(indent=2)}} + +Example function tool call syntax: + +<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>example_function_name +```json +{ + "arg1": "some_value" + ... +} +``` +<|tool▁call▁end|><|tool▁calls▁end|> + +{% endif -%} +{{ns.system_prompt}} +{%- macro flush_tool_outputs() -%} + {%- if ns.is_tool_outputs -%} + {{- '<|tool▁outputs▁end|><|end▁of▁sentence|>' -}} + {%- set ns.is_tool_outputs = false -%} + {%- endif -%} +{%- endmacro -%} +{{- flush_tool_outputs() -}} +{%- for message in messages -%} + {%- if message['role'] != 'tool' -%} + {{- flush_tool_outputs() -}} + {%- endif -%} + {%- if message['role'] == 'user' -%} + {{- '<|User|>' + message['content'] + '<|end▁of▁sentence|>' -}} + {%- endif -%} + {%- if message['role'] == 'assistant' and message['content'] is none -%} + {{- '<|Assistant|><|tool▁calls▁begin|>' -}} + {%- set ns.is_first = true -%} + {%- for tc in message['tool_calls'] -%} + {%- if ns.is_first -%} + {%- set ns.is_first = false -%} + {%- else -%} + {{- '\n' -}} + {%- endif -%} + {%- set tool_name = tc['function']['name'] -%} + {%- set tool_args = tc['function']['arguments'] -%} + {{- '<|tool▁call▁begin|>' + tc['type'] + '<|tool▁sep|>' + tool_name + '\n' + '```json' + '\n' + tool_args + '\n' + '```' + '<|tool▁call▁end|>' -}} + {%- endfor -%} + {{- '<|tool▁calls▁end|><|end▁of▁sentence|>' -}} + {%- endif -%} + {%- if message['role'] == 'assistant' and message['content'] is not none -%} + {{- flush_tool_outputs() -}} + {%- set content = message['content'] -%} + {%- if '' in content -%} + {%- set content = content.split('')[-1] -%} + {%- endif -%} + {{- '<|Assistant|>' + content + '<|end▁of▁sentence|>' -}} + {%- endif -%} + {%- if message['role'] == 'tool' -%} + {%- set ns.is_tool_outputs = true -%} + {%- if ns.is_output_first -%} + {{- '<|tool▁outputs▁begin|>' -}} + {%- set ns.is_output_first = false -%} + {%- endif -%} + {{- '\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>' -}} + {%- endif -%} +{%- endfor -%} +{{- flush_tool_outputs() -}} +{%- if add_generation_prompt and not ns.is_tool_outputs -%} + {{- '<|Assistant|>\n' -}} +{%- endif -%} \ No newline at end of file diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 9b518d1ac..46e27a96e 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -1186,7 +1186,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token return; } } - LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`) (buffer: `%s`)\n", token, piece.c_str(), grammar.trigger_buffer.c_str()); + LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str()); return; } } diff --git a/src/llama-impl.h b/src/llama-impl.h index 12d1fb082..02b1d07f8 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -6,13 +6,13 @@ #include #ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +# if defined(__MINGW32__) && !defined(__clang__) +# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +# else +# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +# endif #else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) +# define LLAMA_ATTRIBUTE_FORMAT(...) #endif // diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998..1ed688e3b 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -37,7 +37,7 @@ struct llama_kv_cache { bool can_shift = false; // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_internal also uses it, so it + // for a free KV slot. llama_decode_impl also uses it, so it // cannot be freely changed after a slot has been allocated. uint32_t head = 0; uint32_t size = 0; diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 990b61297..f40bf2db8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1698,6 +1698,73 @@ struct llama_sampler * llama_sampler_init_penalties( ); } +// top-n-sigma + +struct llama_sampler_top_n_sigma { + const float n; +}; + +static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) { + return "top-n-sigma"; +} + +static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx; + + // find max logit and calculate mean + float max = cur_p->data[0].logit; + float logits_sum = 0; + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit > max) { + max = cur_p->data[i].logit; + } + logits_sum += cur_p->data[i].logit; + } + float mean = logits_sum/cur_p->size; + + // calculate standard deviation + float acc = 0; + for (size_t i = 0; i < cur_p->size; ++i) { + acc += pow(cur_p->data[i].logit - mean, 2); + } + float std = sqrt(acc/cur_p->size); + + //apply mask + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < max - (ctx->n * std)) { + cur_p->data[i].logit = -INFINITY; + } + } + llama_sampler_softmax_impl(cur_p); +} + +static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx; + return llama_sampler_init_top_n_sigma(ctx->n); +} + +static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_n_sigma *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_top_n_sigma_i = { + /* .name = */ llama_sampler_top_n_sigma_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_n_sigma_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_n_sigma_clone, + /* .free = */ llama_sampler_top_n_sigma_free, +}; + +struct llama_sampler * llama_sampler_init_top_n_sigma(float n) { + return llama_sampler_init( + /* .iface = */ &llama_sampler_top_n_sigma_i, + /* .ctx = */ new llama_sampler_top_n_sigma { + /* .n = */ n, + } + ); +} + // DRY struct llama_sampler_dry {