mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-06 16:21:49 +00:00
Merge commit 'b677721819' into concedo_experimental
# Conflicts: # CONTRIBUTING.md # common/chat.cpp # docs/ops.md # docs/ops/CPU.csv # docs/ops/CUDA.csv # docs/ops/OpenCL.csv # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/common.h # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-sycl/softmax.cpp # grammars/README.md # src/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-chat.cpp # tests/test-grammar-integration.cpp # tests/test-grammar-parser.cpp # tests/test-llama-grammar.cpp # tools/mtmd/CMakeLists.txt
This commit is contained in:
commit
34d243bf3c
24 changed files with 1434 additions and 136 deletions
133
common/chat.cpp
133
common/chat.cpp
|
|
@ -1,4 +1,6 @@
|
|||
#include "chat.h"
|
||||
#include "chat-parser.h"
|
||||
#include "chat-peg-parser.h"
|
||||
#include "common.h"
|
||||
#include "json-partial.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
|
@ -150,6 +152,7 @@ struct templates_params {
|
|||
common_chat_tool_choice tool_choice;
|
||||
json json_schema;
|
||||
bool parallel_tool_calls;
|
||||
common_reasoning_format reasoning_format;
|
||||
bool stream;
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = true;
|
||||
|
|
@ -594,6 +597,16 @@ common_chat_templates_ptr common_chat_templates_init(
|
|||
"{%- if false %}");
|
||||
}
|
||||
|
||||
// TODO @aldehir : this is a temporary fix, pending Minja changes
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
|
||||
if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
|
||||
// search for the error message and patch it
|
||||
&& default_template_src.find("if (message['content'] is none or") != std::string::npos) {
|
||||
string_replace_all(default_template_src,
|
||||
"{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
|
||||
"{%- if false %}");
|
||||
}
|
||||
|
||||
std::string token_bos = bos_token_override;
|
||||
std::string token_eos = eos_token_override;
|
||||
bool add_bos = false;
|
||||
|
|
@ -992,6 +1005,118 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
|||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
// Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
|
||||
auto adjusted_messages = json::array();
|
||||
for (const auto & msg : inputs.messages) {
|
||||
auto role = msg.value("role", "");
|
||||
if (role != "system" && role != "assistant") {
|
||||
// Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
|
||||
adjusted_messages.push_back(msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto content = json::array();
|
||||
|
||||
// If message contains `reasoning_content`, add it as a block of type `thinking`
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
content.push_back({
|
||||
{"type", "thinking"},
|
||||
{"thinking", msg.at("reasoning_content").get<std::string>()},
|
||||
});
|
||||
}
|
||||
|
||||
// If message contains `content`, add it as a block of type `text`
|
||||
if (msg.contains("content")) {
|
||||
if (msg.at("content").is_string()) {
|
||||
content.push_back({
|
||||
{"type", "text"},
|
||||
{"text", msg.at("content").get<std::string>()},
|
||||
});
|
||||
} else if (msg.at("content").is_array()) {
|
||||
auto blocks = msg.at("content");
|
||||
content.insert(content.end(), blocks.begin(), blocks.end());
|
||||
}
|
||||
}
|
||||
|
||||
auto adjusted = msg;
|
||||
adjusted["content"] = content;
|
||||
adjusted.erase("reasoning_content");
|
||||
adjusted_messages.push_back(adjusted);
|
||||
}
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = true;
|
||||
|
||||
data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
"[THINK]",
|
||||
"[/THINK]",
|
||||
"[TOOL_CALLS]",
|
||||
"[ARGS]",
|
||||
};
|
||||
|
||||
auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
|
||||
auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
|
||||
|
||||
// Response format parser
|
||||
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
|
||||
// Ministral wants to emit json surrounded by code fences
|
||||
return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
|
||||
}
|
||||
|
||||
// Tool call parser
|
||||
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
auto tool_choice = p.choice();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
const auto & schema = function.at("parameters");
|
||||
|
||||
tool_choice |= p.rule("tool-" + name,
|
||||
p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
|
||||
+ p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
|
||||
);
|
||||
});
|
||||
|
||||
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
|
||||
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
|
||||
auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
|
||||
|
||||
return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
|
||||
}
|
||||
|
||||
// Content only parser
|
||||
include_grammar = false;
|
||||
return reasoning << p.content(p.rest());
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
data.grammar_triggers = {
|
||||
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
|
||||
};
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs);
|
||||
|
|
@ -2346,6 +2471,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|||
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
|
||||
params.add_generation_prompt = inputs.add_generation_prompt;
|
||||
params.tool_choice = inputs.tool_choice;
|
||||
params.reasoning_format = inputs.reasoning_format;
|
||||
params.enable_thinking = inputs.enable_thinking;
|
||||
params.grammar = inputs.grammar;
|
||||
params.now = inputs.now;
|
||||
|
|
@ -2509,6 +2635,13 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|||
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
// Ministral/Mistral Large 3
|
||||
if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
|
||||
src.find("[TOOL_CALLS]") != std::string::npos &&
|
||||
src.find("[ARGS]") != std::string::npos) {
|
||||
return common_chat_params_init_ministral_3(tmpl, params);
|
||||
}
|
||||
|
||||
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
|
||||
return common_chat_params_init_magistral(tmpl, params);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,11 @@
|
|||
#include "console.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cctype>
|
||||
#include <cwctype>
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
|
|
@ -35,9 +40,26 @@
|
|||
|
||||
namespace console {
|
||||
|
||||
#if defined (_WIN32)
|
||||
namespace {
|
||||
// Use private-use unicode values to represent special keys that are not reported
|
||||
// as characters (e.g. arrows on Windows). These values should never clash with
|
||||
// real input and let the rest of the code handle navigation uniformly.
|
||||
static constexpr char32_t KEY_ARROW_LEFT = 0xE000;
|
||||
static constexpr char32_t KEY_ARROW_RIGHT = 0xE001;
|
||||
static constexpr char32_t KEY_ARROW_UP = 0xE002;
|
||||
static constexpr char32_t KEY_ARROW_DOWN = 0xE003;
|
||||
static constexpr char32_t KEY_HOME = 0xE004;
|
||||
static constexpr char32_t KEY_END = 0xE005;
|
||||
static constexpr char32_t KEY_CTRL_ARROW_LEFT = 0xE006;
|
||||
static constexpr char32_t KEY_CTRL_ARROW_RIGHT = 0xE007;
|
||||
static constexpr char32_t KEY_DELETE = 0xE008;
|
||||
}
|
||||
|
||||
//
|
||||
// Console state
|
||||
//
|
||||
#endif
|
||||
|
||||
static bool advanced_display = false;
|
||||
static bool simple_io = true;
|
||||
|
|
@ -176,7 +198,18 @@ namespace console {
|
|||
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
|
||||
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
|
||||
if (wc == 0) {
|
||||
continue;
|
||||
const DWORD ctrl_mask = LEFT_CTRL_PRESSED | RIGHT_CTRL_PRESSED;
|
||||
const bool ctrl_pressed = (record.Event.KeyEvent.dwControlKeyState & ctrl_mask) != 0;
|
||||
switch (record.Event.KeyEvent.wVirtualKeyCode) {
|
||||
case VK_LEFT: return ctrl_pressed ? KEY_CTRL_ARROW_LEFT : KEY_ARROW_LEFT;
|
||||
case VK_RIGHT: return ctrl_pressed ? KEY_CTRL_ARROW_RIGHT : KEY_ARROW_RIGHT;
|
||||
case VK_UP: return KEY_ARROW_UP;
|
||||
case VK_DOWN: return KEY_ARROW_DOWN;
|
||||
case VK_HOME: return KEY_HOME;
|
||||
case VK_END: return KEY_END;
|
||||
case VK_DELETE: return KEY_DELETE;
|
||||
default: continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
|
||||
|
|
@ -315,6 +348,52 @@ namespace console {
|
|||
#endif
|
||||
}
|
||||
|
||||
static char32_t decode_utf8(const std::string & input, size_t pos, size_t & advance) {
|
||||
unsigned char c = static_cast<unsigned char>(input[pos]);
|
||||
if ((c & 0x80u) == 0u) {
|
||||
advance = 1;
|
||||
return c;
|
||||
}
|
||||
if ((c & 0xE0u) == 0xC0u && pos + 1 < input.size()) {
|
||||
unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
|
||||
if ((c1 & 0xC0u) != 0x80u) {
|
||||
advance = 1;
|
||||
return 0xFFFD;
|
||||
}
|
||||
advance = 2;
|
||||
return ((c & 0x1Fu) << 6) | (static_cast<unsigned char>(input[pos + 1]) & 0x3Fu);
|
||||
}
|
||||
if ((c & 0xF0u) == 0xE0u && pos + 2 < input.size()) {
|
||||
unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
|
||||
unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
|
||||
if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u) {
|
||||
advance = 1;
|
||||
return 0xFFFD;
|
||||
}
|
||||
advance = 3;
|
||||
return ((c & 0x0Fu) << 12) |
|
||||
((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 6) |
|
||||
(static_cast<unsigned char>(input[pos + 2]) & 0x3Fu);
|
||||
}
|
||||
if ((c & 0xF8u) == 0xF0u && pos + 3 < input.size()) {
|
||||
unsigned char c1 = static_cast<unsigned char>(input[pos + 1]);
|
||||
unsigned char c2 = static_cast<unsigned char>(input[pos + 2]);
|
||||
unsigned char c3 = static_cast<unsigned char>(input[pos + 3]);
|
||||
if ((c1 & 0xC0u) != 0x80u || (c2 & 0xC0u) != 0x80u || (c3 & 0xC0u) != 0x80u) {
|
||||
advance = 1;
|
||||
return 0xFFFD;
|
||||
}
|
||||
advance = 4;
|
||||
return ((c & 0x07u) << 18) |
|
||||
((static_cast<unsigned char>(input[pos + 1]) & 0x3Fu) << 12) |
|
||||
((static_cast<unsigned char>(input[pos + 2]) & 0x3Fu) << 6) |
|
||||
(static_cast<unsigned char>(input[pos + 3]) & 0x3Fu);
|
||||
}
|
||||
|
||||
advance = 1;
|
||||
return 0xFFFD; // replacement character for invalid input
|
||||
}
|
||||
|
||||
static void append_utf8(char32_t ch, std::string & out) {
|
||||
if (ch <= 0x7F) {
|
||||
out.push_back(static_cast<unsigned char>(ch));
|
||||
|
|
@ -336,22 +415,319 @@ namespace console {
|
|||
}
|
||||
|
||||
// Helper function to remove the last UTF-8 character from a string
|
||||
static void pop_back_utf8_char(std::string & line) {
|
||||
if (line.empty()) {
|
||||
static size_t prev_utf8_char_pos(const std::string & line, size_t pos) {
|
||||
if (pos == 0) return 0;
|
||||
pos--;
|
||||
while (pos > 0 && (line[pos] & 0xC0) == 0x80) {
|
||||
pos--;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static size_t next_utf8_char_pos(const std::string & line, size_t pos) {
|
||||
if (pos >= line.length()) return line.length();
|
||||
pos++;
|
||||
while (pos < line.length() && (line[pos] & 0xC0) == 0x80) {
|
||||
pos++;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static void move_cursor(int delta);
|
||||
static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
|
||||
static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
|
||||
static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths);
|
||||
static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line);
|
||||
|
||||
static void delete_at_cursor(std::string & line, std::vector<int> & widths, size_t & char_pos, size_t & byte_pos) {
|
||||
if (char_pos >= widths.size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t pos = line.length() - 1;
|
||||
size_t next_pos = next_utf8_char_pos(line, byte_pos);
|
||||
int w = widths[char_pos];
|
||||
size_t char_len = next_pos - byte_pos;
|
||||
|
||||
// Find the start of the last UTF-8 character (checking up to 4 bytes back)
|
||||
for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
|
||||
if ((line[pos] & 0xC0) != 0x80) {
|
||||
break; // Found the start of the character
|
||||
}
|
||||
line.erase(byte_pos, char_len);
|
||||
widths.erase(widths.begin() + char_pos);
|
||||
|
||||
size_t p = byte_pos;
|
||||
int tail_width = 0;
|
||||
for (size_t i = char_pos; i < widths.size(); ++i) {
|
||||
size_t following = next_utf8_char_pos(line, p);
|
||||
put_codepoint(line.c_str() + p, following - p, widths[i]);
|
||||
tail_width += widths[i];
|
||||
p = following;
|
||||
}
|
||||
line.erase(pos);
|
||||
|
||||
for (int i = 0; i < w; ++i) {
|
||||
fputc(' ', out);
|
||||
}
|
||||
|
||||
move_cursor(-(tail_width + w));
|
||||
}
|
||||
|
||||
static void clear_current_line(const std::vector<int> & widths) {
|
||||
int total_width = 0;
|
||||
for (int w : widths) {
|
||||
total_width += (w > 0 ? w : 1);
|
||||
}
|
||||
|
||||
if (total_width > 0) {
|
||||
std::string spaces(total_width, ' ');
|
||||
fwrite(spaces.c_str(), 1, total_width, out);
|
||||
move_cursor(-total_width);
|
||||
}
|
||||
}
|
||||
|
||||
static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
|
||||
size_t & byte_pos) {
|
||||
move_to_line_start(char_pos, byte_pos, widths);
|
||||
clear_current_line(widths);
|
||||
|
||||
line = std::move(new_line);
|
||||
widths.clear();
|
||||
byte_pos = 0;
|
||||
char_pos = 0;
|
||||
|
||||
size_t idx = 0;
|
||||
while (idx < line.size()) {
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, idx, advance);
|
||||
int expected_width = estimateWidth(cp);
|
||||
int real_width = put_codepoint(line.c_str() + idx, advance, expected_width);
|
||||
if (real_width < 0) real_width = 0;
|
||||
widths.push_back(real_width);
|
||||
idx += advance;
|
||||
++char_pos;
|
||||
byte_pos = idx;
|
||||
}
|
||||
}
|
||||
|
||||
static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
|
||||
int back_width = 0;
|
||||
for (size_t i = 0; i < char_pos; ++i) {
|
||||
back_width += widths[i];
|
||||
}
|
||||
move_cursor(-back_width);
|
||||
char_pos = 0;
|
||||
byte_pos = 0;
|
||||
}
|
||||
|
||||
static void move_to_line_end(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
|
||||
int forward_width = 0;
|
||||
for (size_t i = char_pos; i < widths.size(); ++i) {
|
||||
forward_width += widths[i];
|
||||
}
|
||||
move_cursor(forward_width);
|
||||
char_pos = widths.size();
|
||||
byte_pos = line.length();
|
||||
}
|
||||
|
||||
static bool has_ctrl_modifier(const std::string & params) {
|
||||
size_t start = 0;
|
||||
while (start < params.size()) {
|
||||
size_t end = params.find(';', start);
|
||||
size_t len = (end == std::string::npos) ? params.size() - start : end - start;
|
||||
if (len > 0) {
|
||||
int value = 0;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
char ch = params[start + i];
|
||||
if (!std::isdigit(static_cast<unsigned char>(ch))) {
|
||||
value = -1;
|
||||
break;
|
||||
}
|
||||
value = value * 10 + (ch - '0');
|
||||
}
|
||||
if (value == 5) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (end == std::string::npos) {
|
||||
break;
|
||||
}
|
||||
start = end + 1;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool is_space_codepoint(char32_t cp) {
|
||||
return std::iswspace(static_cast<wint_t>(cp)) != 0;
|
||||
}
|
||||
|
||||
static void move_word_left(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
|
||||
if (char_pos == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t new_char_pos = char_pos;
|
||||
size_t new_byte_pos = byte_pos;
|
||||
int move_width = 0;
|
||||
|
||||
while (new_char_pos > 0) {
|
||||
size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, prev_byte, advance);
|
||||
if (!is_space_codepoint(cp)) {
|
||||
break;
|
||||
}
|
||||
move_width += widths[new_char_pos - 1];
|
||||
new_char_pos--;
|
||||
new_byte_pos = prev_byte;
|
||||
}
|
||||
|
||||
while (new_char_pos > 0) {
|
||||
size_t prev_byte = prev_utf8_char_pos(line, new_byte_pos);
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, prev_byte, advance);
|
||||
if (is_space_codepoint(cp)) {
|
||||
break;
|
||||
}
|
||||
move_width += widths[new_char_pos - 1];
|
||||
new_char_pos--;
|
||||
new_byte_pos = prev_byte;
|
||||
}
|
||||
|
||||
move_cursor(-move_width);
|
||||
char_pos = new_char_pos;
|
||||
byte_pos = new_byte_pos;
|
||||
}
|
||||
|
||||
static void move_word_right(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths, const std::string & line) {
|
||||
if (char_pos >= widths.size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t new_char_pos = char_pos;
|
||||
size_t new_byte_pos = byte_pos;
|
||||
int move_width = 0;
|
||||
|
||||
while (new_char_pos < widths.size()) {
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, new_byte_pos, advance);
|
||||
if (!is_space_codepoint(cp)) {
|
||||
break;
|
||||
}
|
||||
move_width += widths[new_char_pos];
|
||||
new_char_pos++;
|
||||
new_byte_pos += advance;
|
||||
}
|
||||
|
||||
while (new_char_pos < widths.size()) {
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, new_byte_pos, advance);
|
||||
if (is_space_codepoint(cp)) {
|
||||
break;
|
||||
}
|
||||
move_width += widths[new_char_pos];
|
||||
new_char_pos++;
|
||||
new_byte_pos += advance;
|
||||
}
|
||||
|
||||
while (new_char_pos < widths.size()) {
|
||||
size_t advance = 0;
|
||||
char32_t cp = decode_utf8(line, new_byte_pos, advance);
|
||||
if (!is_space_codepoint(cp)) {
|
||||
break;
|
||||
}
|
||||
move_width += widths[new_char_pos];
|
||||
new_char_pos++;
|
||||
new_byte_pos += advance;
|
||||
}
|
||||
|
||||
move_cursor(move_width);
|
||||
char_pos = new_char_pos;
|
||||
byte_pos = new_byte_pos;
|
||||
}
|
||||
|
||||
static void move_cursor(int delta) {
|
||||
if (delta == 0) return;
|
||||
#if defined(_WIN32)
|
||||
if (hConsole != NULL) {
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
|
||||
COORD newCursorPosition = bufferInfo.dwCursorPosition;
|
||||
int width = bufferInfo.dwSize.X;
|
||||
int newX = newCursorPosition.X + delta;
|
||||
int newY = newCursorPosition.Y;
|
||||
|
||||
while (newX >= width) {
|
||||
newX -= width;
|
||||
newY++;
|
||||
}
|
||||
while (newX < 0) {
|
||||
newX += width;
|
||||
newY--;
|
||||
}
|
||||
|
||||
newCursorPosition.X = newX;
|
||||
newCursorPosition.Y = newY;
|
||||
SetConsoleCursorPosition(hConsole, newCursorPosition);
|
||||
}
|
||||
#else
|
||||
if (delta < 0) {
|
||||
for (int i = 0; i < -delta; i++) fprintf(out, "\b");
|
||||
} else {
|
||||
for (int i = 0; i < delta; i++) fprintf(out, "\033[C");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
struct history_t {
|
||||
std::vector<std::string> entries;
|
||||
size_t viewing_idx = SIZE_MAX;
|
||||
std::string backup_line; // current line before viewing history
|
||||
void add(const std::string & line) {
|
||||
if (line.empty()) {
|
||||
return;
|
||||
}
|
||||
// avoid duplicates with the last entry
|
||||
if (entries.empty() || entries.back() != line) {
|
||||
entries.push_back(line);
|
||||
}
|
||||
// also clear viewing state
|
||||
end_viewing();
|
||||
}
|
||||
bool prev(std::string & cur_line) {
|
||||
if (entries.empty()) {
|
||||
return false;
|
||||
}
|
||||
if (viewing_idx == SIZE_MAX) {
|
||||
return false;
|
||||
}
|
||||
if (viewing_idx > 0) {
|
||||
viewing_idx--;
|
||||
}
|
||||
cur_line = entries[viewing_idx];
|
||||
return true;
|
||||
}
|
||||
bool next(std::string & cur_line) {
|
||||
if (entries.empty() || viewing_idx == SIZE_MAX) {
|
||||
return false;
|
||||
}
|
||||
viewing_idx++;
|
||||
if (viewing_idx >= entries.size()) {
|
||||
cur_line = backup_line;
|
||||
end_viewing();
|
||||
} else {
|
||||
cur_line = entries[viewing_idx];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void begin_viewing(const std::string & line) {
|
||||
backup_line = line;
|
||||
viewing_idx = entries.size();
|
||||
}
|
||||
void end_viewing() {
|
||||
viewing_idx = SIZE_MAX;
|
||||
backup_line.clear();
|
||||
}
|
||||
bool is_viewing() const {
|
||||
return viewing_idx != SIZE_MAX;
|
||||
}
|
||||
} history;
|
||||
|
||||
static bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
if (out != stdout) {
|
||||
fflush(stdout);
|
||||
|
|
@ -362,8 +738,33 @@ namespace console {
|
|||
bool is_special_char = false;
|
||||
bool end_of_stream = false;
|
||||
|
||||
size_t byte_pos = 0; // current byte index
|
||||
size_t char_pos = 0; // current character index (one char can be multiple bytes)
|
||||
|
||||
char32_t input_char;
|
||||
while (true) {
|
||||
assert(char_pos <= byte_pos);
|
||||
assert(char_pos <= widths.size());
|
||||
auto history_prev = [&]() {
|
||||
if (!history.is_viewing()) {
|
||||
history.begin_viewing(line);
|
||||
}
|
||||
std::string new_line;
|
||||
if (!history.prev(new_line)) {
|
||||
return;
|
||||
}
|
||||
set_line_contents(new_line, line, widths, char_pos, byte_pos);
|
||||
};
|
||||
auto history_next = [&]() {
|
||||
if (history.is_viewing()) {
|
||||
std::string new_line;
|
||||
if (!history.next(new_line)) {
|
||||
return;
|
||||
}
|
||||
set_line_contents(new_line, line, widths, char_pos, byte_pos);
|
||||
}
|
||||
};
|
||||
|
||||
fflush(out); // Ensure all output is displayed before waiting for input
|
||||
input_char = getchar32();
|
||||
|
||||
|
|
@ -371,7 +772,7 @@ namespace console {
|
|||
break;
|
||||
}
|
||||
|
||||
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
|
||||
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
|
||||
end_of_stream = true;
|
||||
break;
|
||||
}
|
||||
|
|
@ -384,7 +785,71 @@ namespace console {
|
|||
|
||||
if (input_char == '\033') { // Escape sequence
|
||||
char32_t code = getchar32();
|
||||
if (code == '[' || code == 0x1B) {
|
||||
if (code == '[') {
|
||||
std::string params;
|
||||
while (true) {
|
||||
code = getchar32();
|
||||
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~' || code == (char32_t) WEOF) {
|
||||
break;
|
||||
}
|
||||
params.push_back(static_cast<char>(code));
|
||||
}
|
||||
|
||||
const bool ctrl_modifier = has_ctrl_modifier(params);
|
||||
|
||||
if (code == 'D') { // left
|
||||
if (ctrl_modifier) {
|
||||
move_word_left(char_pos, byte_pos, widths, line);
|
||||
} else if (char_pos > 0) {
|
||||
int w = widths[char_pos - 1];
|
||||
move_cursor(-w);
|
||||
char_pos--;
|
||||
byte_pos = prev_utf8_char_pos(line, byte_pos);
|
||||
}
|
||||
} else if (code == 'C') { // right
|
||||
if (ctrl_modifier) {
|
||||
move_word_right(char_pos, byte_pos, widths, line);
|
||||
} else if (char_pos < widths.size()) {
|
||||
int w = widths[char_pos];
|
||||
move_cursor(w);
|
||||
char_pos++;
|
||||
byte_pos = next_utf8_char_pos(line, byte_pos);
|
||||
}
|
||||
} else if (code == 'H') { // home
|
||||
move_to_line_start(char_pos, byte_pos, widths);
|
||||
} else if (code == 'F') { // end
|
||||
move_to_line_end(char_pos, byte_pos, widths, line);
|
||||
} else if (code == 'A' || code == 'B') {
|
||||
// up/down
|
||||
if (code == 'A') {
|
||||
history_prev();
|
||||
is_special_char = false;
|
||||
} else if (code == 'B') {
|
||||
history_next();
|
||||
is_special_char = false;
|
||||
}
|
||||
} else if ((code == '~' || (code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z')) && !params.empty()) {
|
||||
std::string digits;
|
||||
for (char ch : params) {
|
||||
if (ch == ';') {
|
||||
break;
|
||||
}
|
||||
if (std::isdigit(static_cast<unsigned char>(ch))) {
|
||||
digits.push_back(ch);
|
||||
}
|
||||
}
|
||||
|
||||
if (code == '~') {
|
||||
if (digits == "1" || digits == "7") { // home
|
||||
move_to_line_start(char_pos, byte_pos, widths);
|
||||
} else if (digits == "4" || digits == "8") { // end
|
||||
move_to_line_end(char_pos, byte_pos, widths, line);
|
||||
} else if (digits == "3") { // delete
|
||||
delete_at_cursor(line, widths, char_pos, byte_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (code == 0x1B) {
|
||||
// Discard the rest of the escape sequence
|
||||
while ((code = getchar32()) != (char32_t) WEOF) {
|
||||
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
|
||||
|
|
@ -392,28 +857,107 @@ namespace console {
|
|||
}
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32)
|
||||
} else if (input_char == KEY_ARROW_LEFT) {
|
||||
if (char_pos > 0) {
|
||||
int w = widths[char_pos - 1];
|
||||
move_cursor(-w);
|
||||
char_pos--;
|
||||
byte_pos = prev_utf8_char_pos(line, byte_pos);
|
||||
}
|
||||
} else if (input_char == KEY_ARROW_RIGHT) {
|
||||
if (char_pos < widths.size()) {
|
||||
int w = widths[char_pos];
|
||||
move_cursor(w);
|
||||
char_pos++;
|
||||
byte_pos = next_utf8_char_pos(line, byte_pos);
|
||||
}
|
||||
} else if (input_char == KEY_CTRL_ARROW_LEFT) {
|
||||
move_word_left(char_pos, byte_pos, widths, line);
|
||||
} else if (input_char == KEY_CTRL_ARROW_RIGHT) {
|
||||
move_word_right(char_pos, byte_pos, widths, line);
|
||||
} else if (input_char == KEY_HOME) {
|
||||
move_to_line_start(char_pos, byte_pos, widths);
|
||||
} else if (input_char == KEY_END) {
|
||||
move_to_line_end(char_pos, byte_pos, widths, line);
|
||||
} else if (input_char == KEY_DELETE) {
|
||||
delete_at_cursor(line, widths, char_pos, byte_pos);
|
||||
} else if (input_char == KEY_ARROW_UP || input_char == KEY_ARROW_DOWN) {
|
||||
if (input_char == KEY_ARROW_UP) {
|
||||
history_prev();
|
||||
is_special_char = false;
|
||||
} else if (input_char == KEY_ARROW_DOWN) {
|
||||
history_next();
|
||||
is_special_char = false;
|
||||
}
|
||||
#endif
|
||||
} else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
|
||||
if (!widths.empty()) {
|
||||
int count;
|
||||
do {
|
||||
count = widths.back();
|
||||
widths.pop_back();
|
||||
// Move cursor back, print space, and move cursor back again
|
||||
for (int i = 0; i < count; i++) {
|
||||
replace_last(' ');
|
||||
pop_cursor();
|
||||
}
|
||||
pop_back_utf8_char(line);
|
||||
} while (count == 0 && !widths.empty());
|
||||
if (char_pos > 0) {
|
||||
int w = widths[char_pos - 1];
|
||||
move_cursor(-w);
|
||||
char_pos--;
|
||||
size_t prev_pos = prev_utf8_char_pos(line, byte_pos);
|
||||
size_t char_len = byte_pos - prev_pos;
|
||||
byte_pos = prev_pos;
|
||||
|
||||
// remove the character
|
||||
line.erase(byte_pos, char_len);
|
||||
widths.erase(widths.begin() + char_pos);
|
||||
|
||||
// redraw tail
|
||||
size_t p = byte_pos;
|
||||
int tail_width = 0;
|
||||
for (size_t i = char_pos; i < widths.size(); ++i) {
|
||||
size_t next_p = next_utf8_char_pos(line, p);
|
||||
put_codepoint(line.c_str() + p, next_p - p, widths[i]);
|
||||
tail_width += widths[i];
|
||||
p = next_p;
|
||||
}
|
||||
|
||||
// clear display
|
||||
for (int i = 0; i < w; ++i) {
|
||||
fputc(' ', out);
|
||||
}
|
||||
move_cursor(-(tail_width + w));
|
||||
}
|
||||
} else {
|
||||
int offset = line.length();
|
||||
append_utf8(input_char, line);
|
||||
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
|
||||
if (width < 0) {
|
||||
width = 0;
|
||||
// insert character
|
||||
std::string new_char_str;
|
||||
append_utf8(input_char, new_char_str);
|
||||
int w = estimateWidth(input_char);
|
||||
|
||||
if (char_pos == widths.size()) {
|
||||
// insert at the end
|
||||
line += new_char_str;
|
||||
int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
|
||||
if (real_w < 0) real_w = 0;
|
||||
widths.push_back(real_w);
|
||||
byte_pos += new_char_str.length();
|
||||
char_pos++;
|
||||
} else {
|
||||
// insert in middle
|
||||
line.insert(byte_pos, new_char_str);
|
||||
|
||||
int real_w = put_codepoint(new_char_str.c_str(), new_char_str.length(), w);
|
||||
if (real_w < 0) real_w = 0;
|
||||
|
||||
widths.insert(widths.begin() + char_pos, real_w);
|
||||
|
||||
// print the tail
|
||||
size_t p = byte_pos + new_char_str.length();
|
||||
int tail_width = 0;
|
||||
for (size_t i = char_pos + 1; i < widths.size(); ++i) {
|
||||
size_t next_p = next_utf8_char_pos(line, p);
|
||||
put_codepoint(line.c_str() + p, next_p - p, widths[i]);
|
||||
tail_width += widths[i];
|
||||
p = next_p;
|
||||
}
|
||||
|
||||
move_cursor(-tail_width);
|
||||
|
||||
byte_pos += new_char_str.length();
|
||||
char_pos++;
|
||||
}
|
||||
widths.push_back(width);
|
||||
}
|
||||
|
||||
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
|
||||
|
|
@ -451,6 +995,15 @@ namespace console {
|
|||
}
|
||||
}
|
||||
|
||||
if (!end_of_stream && !line.empty()) {
|
||||
// remove the trailing newline for history storage
|
||||
if (!line.empty() && line.back() == '\n') {
|
||||
line.pop_back();
|
||||
}
|
||||
// TODO: maybe support multiline history entries?
|
||||
history.add(line);
|
||||
}
|
||||
|
||||
fflush(out);
|
||||
return has_more;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -383,6 +383,17 @@ class ModelBase:
|
|||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith(".activation_scale"): # unused
|
||||
tensors_to_remove.append(name)
|
||||
# mistral format
|
||||
if name.endswith(".qscale_weight"):
|
||||
weight_name = name.removesuffix("qscale_weight") + "weight"
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith(".qscale_act"):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".qweight"):
|
||||
|
|
@ -2854,13 +2865,10 @@ class Mistral3Model(LlamaModel):
|
|||
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
# TODO: probably not worth supporting quantized weight, as official BF16 is also available
|
||||
if name.endswith("weight_scale_inv"):
|
||||
raise ValueError("This is a quantized weight, please use BF16 weight instead")
|
||||
|
||||
name = name.replace("language_model.", "")
|
||||
if "multi_modal_projector" in name or "vision_tower" in name:
|
||||
return []
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
|
|
@ -5825,9 +5833,11 @@ class Gemma3Model(TextModel):
|
|||
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
self._set_vocab_sentencepiece()
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
else:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
|
|
@ -5845,13 +5855,24 @@ class Gemma3Model(TextModel):
|
|||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
||||
# attn_logit_softcapping is removed in Gemma3
|
||||
assert hparams.get("attn_logit_softcapping") is None
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
if (final_logit_softcap := hparams.get("final_logit_softcapping")):
|
||||
self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
|
||||
if hparams.get("sliding_window_pattern") != 1:
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
||||
if hparams.get("rope_scaling") is not None:
|
||||
assert hparams["rope_scaling"]["rope_type"] == "linear"
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = hparams["rope_scaling"]
|
||||
if rope_scaling["rope_type"] == "linear":
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
elif rope_scaling["rope_type"] == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
|
@ -5865,8 +5886,10 @@ class Gemma3Model(TextModel):
|
|||
|
||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||
if "embed_tokens.weight" in name:
|
||||
vocab = self._create_vocab_sentencepiece()
|
||||
tokens = vocab[0]
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
tokens = self._create_vocab_sentencepiece()[0]
|
||||
else:
|
||||
tokens = self.get_vocab_base()[0]
|
||||
data_torch = data_torch[:len(tokens)]
|
||||
|
||||
# ref code in Gemma3RMSNorm
|
||||
|
|
@ -9883,6 +9906,18 @@ class MistralModel(LlamaModel):
|
|||
self.gguf_writer.add_architecture()
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def dequant_model(self):
|
||||
# transform quantization config into HF format
|
||||
quant_config = self.hparams.get("quantization")
|
||||
if quant_config is not None:
|
||||
assert quant_config["qformat_weight"] == "fp8_e4m3"
|
||||
self.hparams["quantization_config"] = {
|
||||
"activation_scheme": "static",
|
||||
"quant_method": "fp8",
|
||||
"weight_block_size": None,
|
||||
}
|
||||
return super().dequant_model()
|
||||
|
||||
@staticmethod
|
||||
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
|
||||
assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
|
|||
// ops that return true for this function must not use restrict pointers for their backend implementations
|
||||
bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_FILL:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
|
|
|
|||
77
ggml/src/ggml-cuda/diag.cu
Normal file
77
ggml/src/ggml-cuda/diag.cu
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
#include "convert.cuh"
|
||||
#include "diag.cuh"
|
||||
#include "ggml.h"
|
||||
|
||||
template <typename T>
|
||||
static __global__ void diag_kernel(T * __restrict__ dst,
|
||||
const T * __restrict__ src,
|
||||
const int64_t ne0,
|
||||
const int64_t ne1,
|
||||
const int64_t ne2,
|
||||
const int64_t ne3,
|
||||
const int64_t total_elements) {
|
||||
const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i0 = global_idx % ne0;
|
||||
const int64_t i1 = (global_idx / ne0) % ne1;
|
||||
const int64_t i2 = (global_idx / (ne0 * ne1)) % ne2;
|
||||
const int64_t i3 = global_idx / (ne0 * ne1 * ne2);
|
||||
|
||||
const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;
|
||||
|
||||
if (i0 == i1) {
|
||||
const int64_t batch_idx = i3 * ne2 + i2;
|
||||
const int64_t src_idx = batch_idx * ne0 + i0;
|
||||
dst[dst_idx] = src[src_idx];
|
||||
} else {
|
||||
dst[dst_idx] = ggml_cuda_cast<T>(0);
|
||||
}
|
||||
GGML_UNUSED_VARS(ne3);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
void * dst_d = dst->data;
|
||||
const void * src0_d = src0->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
const int64_t ne3 = dst->ne[3];
|
||||
|
||||
GGML_ASSERT(ne00 == ne0);
|
||||
GGML_ASSERT(ne01 == 1);
|
||||
GGML_ASSERT(ne02 == ne2);
|
||||
GGML_ASSERT(ne03 == ne3);
|
||||
|
||||
const int64_t n_elems = ggml_nelements(dst);
|
||||
const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32:
|
||||
diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne0,
|
||||
ne1, ne2, ne3, n_elems);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne0,
|
||||
ne1, ne2, ne3, n_elems);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("unsupported type");
|
||||
}
|
||||
}
|
||||
5
ggml/src/ggml-cuda/diag.cuh
Normal file
5
ggml/src/ggml-cuda/diag.cuh
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
#include "common.cuh"
|
||||
|
||||
#define CUDA_DIAG_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
|
@ -955,22 +955,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
(K_h2 + int64_t(kb0)*nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K, k_VKQ_sup);
|
||||
}
|
||||
|
||||
for (; kb0 < kb0_stop-1; ++kb0) {
|
||||
constexpr bool last_iter = false;
|
||||
constexpr bool oob_check = false;
|
||||
constexpr int k_VKQ_sup = nbatch_fa;
|
||||
flash_attn_ext_f16_iter
|
||||
<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
|
||||
T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
|
||||
(Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
|
||||
KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
|
||||
}
|
||||
// kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
|
||||
if constexpr (ncols2 == 1) {
|
||||
if (ne11 % nbatch_fa == 0) {
|
||||
constexpr bool last_iter = true;
|
||||
constexpr bool oob_check = false;
|
||||
constexpr bool oob_check = true;
|
||||
for (; kb0 < kb0_stop-1; ++kb0) {
|
||||
constexpr bool last_iter = false;
|
||||
constexpr int k_VKQ_sup = nbatch_fa;
|
||||
flash_attn_ext_f16_iter
|
||||
<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
|
||||
|
|
@ -978,10 +967,20 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
(Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
|
||||
KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
|
||||
} else {
|
||||
constexpr bool last_iter = true;
|
||||
constexpr bool oob_check = true;
|
||||
const int k_VKQ_sup = ne11 - kb0*nbatch_fa;
|
||||
}
|
||||
constexpr bool last_iter = true;
|
||||
const int k_VKQ_sup = ne11 - kb0*nbatch_fa;
|
||||
flash_attn_ext_f16_iter
|
||||
<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
|
||||
T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
|
||||
(Q_f2, K_h2, V_h2, mask_h, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
|
||||
KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
|
||||
} else {
|
||||
constexpr bool oob_check = false;
|
||||
for (; kb0 < kb0_stop-1; ++kb0) {
|
||||
constexpr bool last_iter = false;
|
||||
constexpr int k_VKQ_sup = nbatch_fa;
|
||||
flash_attn_ext_f16_iter
|
||||
<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
|
||||
T_A_KQ, T_B_KQ, T_C_KQ, T_A_VKQ, T_B_VKQ, T_C_VKQ>
|
||||
|
|
@ -989,9 +988,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C,
|
||||
KQ_max, KQ_rowsum, jt, kb0, k_VKQ_sup);
|
||||
}
|
||||
} else {
|
||||
constexpr bool last_iter = true;
|
||||
constexpr bool oob_check = false;
|
||||
constexpr int k_VKQ_sup = nbatch_fa;
|
||||
flash_attn_ext_f16_iter
|
||||
<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter, oob_check,
|
||||
|
|
|
|||
|
|
@ -564,6 +564,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|||
for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
|
||||
const int i_KQ = i_KQ_0 + (threadIdx.y % np)*warp_size + threadIdx.x;
|
||||
|
||||
#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
// Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
|
||||
// Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
|
||||
KQ_acc[i_KQ_0/(np*warp_size)*cpw + jc0] *= 4.0f;
|
||||
#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
|
||||
if (use_logit_softcap) {
|
||||
KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] = logit_softcap * tanhf(KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
|
||||
}
|
||||
|
|
@ -858,6 +864,11 @@ static __global__ void flash_attn_tile(
|
|||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
|
||||
tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
|
||||
#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
// Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
|
||||
// Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
|
||||
tmp_h2[i1/2] *= make_half2(0.25f, 0.25f);
|
||||
#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
&Q_tmp[jc*(DKQ/2) + i0/2 + (threadIdx.y % np)*(warp_size*cpy_ne_D/2) + threadIdx.x*(cpy_ne_D/2)],
|
||||
|
|
|
|||
|
|
@ -36,12 +36,26 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
|
|||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
// Edge cases like no mask, ALiBi, unpadded K/V, or misaligned addresses for large data transfers
|
||||
// are put into the template specialization without GQA optimizations.
|
||||
bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
for (const ggml_tensor * t : {Q, K, V, mask}) {
|
||||
if (t == nullptr) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
if (t->nb[i] % 16 != 0) {
|
||||
use_gqa_opt = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
#define CUDA_FILL_BLOCK_SIZE 256
|
||||
|
||||
template <typename T>
|
||||
static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
|
||||
static __global__ void fill_kernel(T * dst, const int64_t k, const T value) {
|
||||
const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ bool g_mul_mat_q = true;
|
|||
#include "ggml-cuda/cpy.cuh"
|
||||
#include "ggml-cuda/cross-entropy-loss.cuh"
|
||||
#include "ggml-cuda/diagmask.cuh"
|
||||
#include "ggml-cuda/diag.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
|
|
@ -2654,6 +2655,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
break;
|
||||
case GGML_OP_DIAG:
|
||||
ggml_cuda_op_diag(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
ggml_cuda_op_diag_mask_inf(ctx, dst);
|
||||
break;
|
||||
|
|
@ -4637,6 +4641,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_OP_FILL:
|
||||
case GGML_OP_CUMSUM:
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
return true;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
|
||||
|
|
|
|||
|
|
@ -411,6 +411,38 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_me
|
|||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched(ggml_metal_library_t lib, const ggml_tensor * op, int ssm_conv_bs) {
|
||||
GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(op->src[0]));
|
||||
GGML_ASSERT(ggml_is_contiguous(op->src[1]));
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
const char * suffix = "";
|
||||
if (op->src[1]->ne[0] % 4 == 0) {
|
||||
suffix = "_4";
|
||||
}
|
||||
|
||||
snprintf(base, 256, "kernel_ssm_conv_%s_%s_batched%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type), suffix);
|
||||
snprintf(name, 256, "%s_ssm_conv_bs=%d", base, ssm_conv_bs);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_int16(cv, ssm_conv_bs, FC_SSM_CONV + 0);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
ggml_metal_cv_free(cv);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
|
||||
|
|
@ -427,7 +459,12 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan(ggml_me
|
|||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
}
|
||||
|
||||
res.smem = 32*sizeof(float)*nsg;
|
||||
// Shared memory layout:
|
||||
// - sgptg * NW floats for partial sums (nsg * 32)
|
||||
// - sgptg floats for shared_x_dt (nsg)
|
||||
// - sgptg floats for shared_dA (nsg)
|
||||
// Total: nsg * (32 + 2) floats
|
||||
res.smem = (32 + 2)*sizeof(float)*nsg;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_ad
|
|||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@
|
|||
#define FC_MUL_MV 600
|
||||
#define FC_MUL_MM 700
|
||||
#define FC_ROPE 800
|
||||
#define FC_SSM_CONV 900
|
||||
|
||||
// op-specific constants
|
||||
#define OP_FLASH_ATTN_EXT_NQPTG 8
|
||||
|
|
|
|||
|
|
@ -221,7 +221,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
|||
}
|
||||
|
||||
if (ctx->debug_graph > 0) {
|
||||
GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
|
||||
GGML_LOG_DEBUG("%s: node[%5d] - %-12s %-12s %s\n", __func__, idx, ggml_op_name(node->op), ggml_get_name(node), is_concurrent ? "(concurrent)" : "");
|
||||
}
|
||||
if (ctx->debug_graph > 1) {
|
||||
GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
|
||||
|
|
@ -1365,15 +1365,43 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
|
|||
/*.nb2 =*/ nb2,
|
||||
};
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
|
||||
// Use batched kernel for prefill (ne1 > 1) to reduce threadgroup dispatch overhead
|
||||
const bool use_batched = (ne1 > 1);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
|
||||
if (use_batched) {
|
||||
// Determine the smallest power of 2 that's >= ne1, but <= 256
|
||||
int BATCH_SIZE;
|
||||
if (ne1 > 128) BATCH_SIZE = 256;
|
||||
else if (ne1 > 64 ) BATCH_SIZE = 128;
|
||||
else if (ne1 > 32 ) BATCH_SIZE = 64;
|
||||
else if (ne1 > 16 ) BATCH_SIZE = 32;
|
||||
else if (ne1 > 8 ) BATCH_SIZE = 16;
|
||||
else if (ne1 > 4 ) BATCH_SIZE = 8;
|
||||
else BATCH_SIZE = 2;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
|
||||
auto pipeline = ggml_metal_library_get_pipeline_ssm_conv_batched(lib, op, BATCH_SIZE);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
// Dispatch: ne01 rows, ceil(ne1/BATCH_SIZE) token batches, ne02 sequences
|
||||
// Each threadgroup has BATCH_SIZE threads, each handling one token
|
||||
const int n_token_batches = (ne1 + BATCH_SIZE - 1) / BATCH_SIZE;
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, n_token_batches, ne02, BATCH_SIZE, 1, 1);
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2343,7 +2343,102 @@ kernel void kernel_ssm_conv_f32_f32_4(
|
|||
x[0] = sumf;
|
||||
}
|
||||
|
||||
constant short FC_ssm_conv_bs [[function_constant(FC_SSM_CONV + 0)]];
|
||||
|
||||
// Batched version: each threadgroup processes multiple tokens for better efficiency
|
||||
// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens
|
||||
kernel void kernel_ssm_conv_f32_f32_batched(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// tgpig.x = row index (ir)
|
||||
// tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
|
||||
// tgpig.z = sequence index (i3)
|
||||
// tpitg.x = thread within batch (0..BATCH_SIZE-1)
|
||||
const short BATCH_SIZE = FC_ssm_conv_bs;
|
||||
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2_base = tgpig.y * BATCH_SIZE;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2_off = tpitg.x;
|
||||
const int64_t i2 = i2_base + i2_off;
|
||||
|
||||
const int64_t nc = args.ne10; // conv kernel size (typically 4)
|
||||
const int64_t n_t = args.ne1; // number of tokens
|
||||
|
||||
// Bounds check for partial batches at the end
|
||||
if (i2 >= n_t) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load conv weights (shared across all tokens for this row)
|
||||
device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
|
||||
|
||||
// Load source for this specific token
|
||||
device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
|
||||
// Output location for this token
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0] * c[i0];
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
kernel void kernel_ssm_conv_f32_f32_batched_4(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// tgpig.x = row index (ir)
|
||||
// tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
|
||||
// tgpig.z = sequence index (i3)
|
||||
// tpitg.x = thread within batch (0..BATCH_SIZE-1)
|
||||
const short BATCH_SIZE = FC_ssm_conv_bs;
|
||||
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2_base = tgpig.y * BATCH_SIZE;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2_off = tpitg.x;
|
||||
const int64_t i2 = i2_base + i2_off;
|
||||
|
||||
const int64_t nc = args.ne10; // conv kernel size (typically 4)
|
||||
const int64_t n_t = args.ne1; // number of tokens
|
||||
|
||||
// Bounds check for partial batches at the end
|
||||
if (i2 >= n_t) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load conv weights (shared across all tokens for this row)
|
||||
device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
|
||||
|
||||
// Load source for this specific token
|
||||
device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
|
||||
// Output location for this token
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
for (int64_t i0 = 0; i0 < nc/4; ++i0) {
|
||||
sumf += dot(s[i0], c[i0]);
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
|
||||
// Optimized version: reduces redundant memory loads by having one thread load shared values
|
||||
kernel void kernel_ssm_scan_f32(
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
device const void * src0,
|
||||
|
|
@ -2363,7 +2458,15 @@ kernel void kernel_ssm_scan_f32(
|
|||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
|
||||
shared[tpitg.x] = 0.0f;
|
||||
// Shared memory layout:
|
||||
// [0..sgptg*NW-1]: partial sums for reduction (existing)
|
||||
// [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch
|
||||
// [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch
|
||||
threadgroup float * shared_sums = shared;
|
||||
threadgroup float * shared_x_dt = shared + sgptg * NW;
|
||||
threadgroup float * shared_dA = shared + sgptg * NW + sgptg;
|
||||
|
||||
shared_sums[tpitg.x] = 0.0f;
|
||||
|
||||
const int32_t i0 = tpitg.x;
|
||||
const int32_t i1 = tgpig.x;
|
||||
|
|
@ -2403,32 +2506,47 @@ kernel void kernel_ssm_scan_f32(
|
|||
for (int i2 = 0; i2 < n_t; i2 += sgptg) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
|
||||
const float dt0 = dt[0];
|
||||
// Pre-compute x_dt and dA for this batch of tokens
|
||||
// Only first sgptg threads do the loads and expensive math
|
||||
if (i0 < sgptg && i2 + i0 < n_t) {
|
||||
// ns12 and ns21 are element strides (nb12/nb10, nb21/nb20)
|
||||
device const float * x_t = x + i0 * args.ns12;
|
||||
device const float * dt_t = dt + i0 * args.ns21;
|
||||
|
||||
const float dt0 = dt_t[0];
|
||||
const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
|
||||
const float x_dt = x[0] * dtsp;
|
||||
const float dA = exp(dtsp * A0);
|
||||
shared_x_dt[i0] = x_t[0] * dtsp;
|
||||
shared_dA[i0] = dtsp; // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
|
||||
const float x_dt = shared_x_dt[t];
|
||||
const float dA = exp(shared_dA[t] * A0);
|
||||
|
||||
s = (s0 * dA) + (B[i0] * x_dt);
|
||||
|
||||
const float sumf = simd_sum(s * C[i0]);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shared[t*NW + sgitg] = sumf;
|
||||
shared_sums[t*NW + sgitg] = sumf;
|
||||
}
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
|
||||
x += args.ns12;
|
||||
dt += args.ns21;
|
||||
B += args.ns42;
|
||||
C += args.ns52;
|
||||
}
|
||||
|
||||
// Advance pointers for next batch
|
||||
x += sgptg * args.ns12;
|
||||
dt += sgptg * args.ns21;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float sumf = simd_sum(shared[sgitg*NW + tiisg]);
|
||||
const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]);
|
||||
|
||||
if (tiisg == 0 && i2 + sgitg < n_t) {
|
||||
y[sgitg*nh*nr] = sumf;
|
||||
|
|
|
|||
|
|
@ -128,6 +128,13 @@ static void ggml_print_backtrace_symbols(void) {
|
|||
// int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
|
||||
// backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
|
||||
}
|
||||
#elif defined(__APPLE__)
|
||||
#include <execinfo.h>
|
||||
static void ggml_print_backtrace_symbols(void) {
|
||||
void * trace[100];
|
||||
int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
|
||||
backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
|
||||
}
|
||||
#else
|
||||
static void ggml_print_backtrace_symbols(void) {
|
||||
// platform not supported
|
||||
|
|
@ -139,6 +146,20 @@ void ggml_print_backtrace(void) {
|
|||
if (GGML_NO_BACKTRACE) {
|
||||
return;
|
||||
}
|
||||
#if defined(__APPLE__)
|
||||
// On macOS, fork+debugger attachment is problematic due to:
|
||||
// 1. libdispatch "poisons" forked child processes
|
||||
// 2. lldb has issues attaching to parent from forked child
|
||||
// Use simple backtrace() instead to avoid Terminal.app crashes
|
||||
const char * GGML_BACKTRACE_LLDB = getenv("GGML_BACKTRACE_LLDB");
|
||||
if (!GGML_BACKTRACE_LLDB) {
|
||||
fprintf(stderr, "WARNING: Using native backtrace. Set GGML_BACKTRACE_LLDB for more info.\n");
|
||||
fprintf(stderr, "WARNING: GGML_BACKTRACE_LLDB may cause native MacOS Terminal.app to crash.\n");
|
||||
fprintf(stderr, "See: https://github.com/ggml-org/llama.cpp/pull/17869\n");
|
||||
ggml_print_backtrace_symbols();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__linux__)
|
||||
FILE * f = fopen("/proc/self/status", "r");
|
||||
size_t size = 0;
|
||||
|
|
|
|||
|
|
@ -206,6 +206,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
|||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
|
||||
const char * pos = src;
|
||||
if (*pos != '<') {
|
||||
throw std::runtime_error(std::string("expecting '<' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
// Parse <[id]>
|
||||
if (*pos == '[') {
|
||||
pos++;
|
||||
const char * int_end = parse_int(pos);
|
||||
uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = int_end;
|
||||
if (*pos != ']') {
|
||||
throw std::runtime_error(std::string("expecting ']' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
return std::make_pair(token_id, pos);
|
||||
}
|
||||
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error(std::string("no vocab to parse token at ") + src);
|
||||
}
|
||||
|
||||
// Parse <token> and tokenize to obtain the token id
|
||||
while (*pos != 0 && *pos != '>') {
|
||||
pos++;
|
||||
}
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
llama_token tokens[2];
|
||||
int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
|
||||
if (n_tokens != 1) {
|
||||
// must tokenize to exactly 1 token
|
||||
throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
|
||||
}
|
||||
return std::make_pair(tokens[0], pos);
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
|
|
@ -237,6 +283,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
|||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
|
|
@ -253,6 +301,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
|||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
|
|
@ -309,6 +368,17 @@ static void print_rule(
|
|||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
|
|
@ -469,6 +539,17 @@ const char * llama_grammar_parser::parse_sequence(
|
|||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '<' || *pos == '!') { // token
|
||||
auto type = LLAMA_GRETYPE_TOKEN;
|
||||
if (*pos == '!') { // token inverse
|
||||
type = LLAMA_GRETYPE_TOKEN_NOT;
|
||||
pos++;
|
||||
}
|
||||
auto token_pair = parse_token(vocab, pos);
|
||||
const char * token_end = token_pair.second;
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({type, token_pair.first});
|
||||
pos = parse_space(token_end, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
|
|
@ -716,6 +797,21 @@ static bool llama_grammar_match_partial_char(
|
|||
return !is_positive_char;
|
||||
}
|
||||
|
||||
// returns true iff token matches the rule at pos (regular or inverse)
|
||||
// asserts that pos is pointing to a token element
|
||||
static bool llama_grammar_match_token(
|
||||
const llama_grammar_element * pos,
|
||||
const llama_token token) {
|
||||
GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN) {
|
||||
return pos->value == static_cast<uint32_t>(token);
|
||||
}
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return pos->value != static_cast<uint32_t>(token);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// transforms a grammar pushdown stack into N possible stacks, all ending
|
||||
// at a character range (terminal element)
|
||||
static void llama_grammar_advance_stack(
|
||||
|
|
@ -763,6 +859,8 @@ static void llama_grammar_advance_stack(
|
|||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
// only add the stack if it's not a duplicate of one we already have
|
||||
new_stacks.emplace_back(stack);
|
||||
|
|
@ -856,26 +954,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
|
|||
return grammar->stacks;
|
||||
}
|
||||
|
||||
static void llama_grammar_accept_chr(
|
||||
struct llama_grammar & grammar,
|
||||
const llama_grammar_stack & stack,
|
||||
uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
// ignore if this turns into a token
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(pos, chr);
|
||||
if (match.first) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(match.second)) {
|
||||
new_stack.push_back(match.second);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar->stacks.size());
|
||||
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
|
||||
}
|
||||
llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
|
||||
}
|
||||
|
||||
grammar->stacks = std::move(stacks_new);
|
||||
|
|
@ -932,6 +1042,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
|
||||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
// if the top of the stack is a token rule, then we only need to check the token id
|
||||
if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached the end of a token consumed by char rules, reject iff it ended
|
||||
// in a partial response
|
||||
if (tok.partial_utf8.n_remain != 0) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (!llama_grammar_match_token(stack_pos, tok.id)) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
}
|
||||
return rejects;
|
||||
}
|
||||
|
||||
llama_grammar_candidates next_candidates;
|
||||
next_candidates.reserve(candidates.size());
|
||||
|
||||
|
|
@ -944,7 +1070,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
|
||||
} else {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
|
|
@ -962,7 +1088,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
|
||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||
for (const auto & tok : next_rejects) {
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
|
||||
}
|
||||
|
||||
if (cache_target) {
|
||||
|
|
@ -1032,12 +1158,13 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy =*/ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -1050,7 +1177,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
size_t num_trigger_patterns,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
llama_grammar_parser parser;
|
||||
llama_grammar_parser parser(vocab);
|
||||
|
||||
// if there is a grammar, parse it
|
||||
// rules will be empty (default) if there are parse errors
|
||||
|
|
@ -1137,10 +1264,11 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
std::move(vec_trigger_tokens),
|
||||
std::move(vec_trigger_patterns),
|
||||
};
|
||||
|
|
@ -1163,6 +1291,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
|||
grammar.lazy,
|
||||
grammar.awaiting_trigger,
|
||||
grammar.trigger_buffer,
|
||||
grammar.trigger_buffer_positions,
|
||||
grammar.trigger_tokens,
|
||||
grammar.trigger_patterns,
|
||||
};
|
||||
|
|
@ -1216,7 +1345,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
|||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1235,10 +1364,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
grammar.awaiting_trigger = false;
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
||||
return;
|
||||
} else {
|
||||
auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
|
||||
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
|
||||
grammar.trigger_buffer += piece;
|
||||
|
||||
std::smatch match;
|
||||
|
|
@ -1256,10 +1387,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
if (start == std::string::npos) {
|
||||
start = match.position(0);
|
||||
}
|
||||
|
||||
// replay tokens that overlap with [start, end)
|
||||
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
|
||||
auto [tok_start, tok_end] = tok_pos;
|
||||
if (tok_end <= start) {
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
|
||||
size_t piece_len = tok_end - piece_start;
|
||||
auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
|
||||
llama_grammar_accept_token(grammar, tok, tok_piece);
|
||||
}
|
||||
|
||||
auto constrained_str = grammar.trigger_buffer.substr(start);
|
||||
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, constrained_str);
|
||||
grammar.trigger_buffer_positions.clear();
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
|
@ -1278,7 +1422,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
}
|
||||
|
||||
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
||||
|
|
@ -1295,3 +1439,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
|||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar.stacks.size());
|
||||
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
if (llama_grammar_match_token(pos, token)) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||
new_stack.push_back(pos + 1);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
|
||||
}
|
||||
} else {
|
||||
llama_grammar_stacks current_stacks = {stack};
|
||||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
llama_grammar_stacks next_stacks;
|
||||
|
||||
for (const auto & cur_stack : current_stacks) {
|
||||
llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
|
||||
}
|
||||
|
||||
current_stacks = std::move(next_stacks);
|
||||
if (current_stacks.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & surviving_stack : current_stacks) {
|
||||
if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
|
||||
stacks_new.emplace_back(surviving_stack);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
grammar.stacks = std::move(stacks_new);
|
||||
grammar.partial_utf8 = decoded.second;
|
||||
|
||||
if (grammar.stacks.empty()) {
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,11 +36,17 @@ enum llama_gretype {
|
|||
|
||||
// any character (.)
|
||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||
|
||||
// terminal element: token (<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN = 8,
|
||||
|
||||
// inverse token (!<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN_NOT = 9,
|
||||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
uint32_t value; // Unicode code point, rule ID, or token ID
|
||||
} llama_grammar_element;
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
|
|
@ -52,6 +58,7 @@ struct llama_grammar_candidate {
|
|||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
llama_token id;
|
||||
};
|
||||
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
|
|
@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
|||
const llama_grammar_candidates & candidates);
|
||||
|
||||
struct llama_grammar_parser {
|
||||
const llama_vocab * vocab;
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
|
||||
llama_grammar_rules rules;
|
||||
|
||||
llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
|
||||
|
||||
llama_grammar_stack c_rules() const;
|
||||
|
||||
uint32_t get_symbol_id(const char * src, size_t len);
|
||||
|
|
@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
|
|||
};
|
||||
|
||||
struct llama_grammar {
|
||||
// maintain a list of llama_tokens and their positions in the trigger_buffer
|
||||
using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
|
||||
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab * vocab;
|
||||
|
||||
|
|
@ -127,6 +140,7 @@ struct llama_grammar {
|
|||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||
std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
|
||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||
std::vector<llama_grammar_trigger_pattern>
|
||||
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
||||
|
|
@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
|
|||
void llama_grammar_accept_str(
|
||||
struct llama_grammar & grammar,
|
||||
const std::string & piece);
|
||||
|
||||
void llama_grammar_accept_token(
|
||||
struct llama_grammar & grammar,
|
||||
llama_token token,
|
||||
const std::string & piece);
|
||||
|
|
|
|||
|
|
@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
|
||||
// mask out the other groups
|
||||
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
|
||||
cb(selection_probs, "ffn_moe_probs_masked", il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@
|
|||
#include "models/gemma-embedding.cpp"
|
||||
#include "models/gemma.cpp"
|
||||
#include "models/gemma2-iswa.cpp"
|
||||
#include "models/gemma3-iswa.cpp"
|
||||
#include "models/gemma3.cpp"
|
||||
#include "models/gemma3n-iswa.cpp"
|
||||
#include "models/glm4-moe.cpp"
|
||||
#include "models/glm4.cpp"
|
||||
|
|
@ -225,6 +225,7 @@ const char * llm_type_name(llm_type type) {
|
|||
case LLM_TYPE_16B_A1B: return "16B.A1B";
|
||||
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
||||
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||
case LLM_TYPE_80B_A3B: return "80B.A3B";
|
||||
case LLM_TYPE_100B_A6B: return "100B.A6B";
|
||||
case LLM_TYPE_106B_A12B: return "106B.A12B";
|
||||
case LLM_TYPE_230B_A10B: return "230B.A10B";
|
||||
|
|
@ -1369,18 +1370,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
hparams.f_final_logit_softcapping = 0.0f;
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
case 26: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_8B; break; // Rnj-1
|
||||
case 34: type = LLM_TYPE_4B; break;
|
||||
case 48: type = LLM_TYPE_12B; break;
|
||||
case 62: type = LLM_TYPE_27B; break;
|
||||
|
|
@ -1704,8 +1712,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = LLM_TYPE_20B; break;
|
||||
switch (hparams.n_ff_exp) {
|
||||
case 1408: type = LLM_TYPE_16B; break;
|
||||
case 1792: type = LLM_TYPE_20B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -2354,7 +2363,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 80: type = LLM_TYPE_80B_A3B; break;
|
||||
case 48: type = LLM_TYPE_80B_A3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -7466,7 +7475,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
template <bool iswa>
|
||||
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
|
||||
ggml_tensor * cur;
|
||||
|
|
@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: is causal == true correct? might need some changes
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
float freq_base_l = 0.0f;
|
||||
float freq_scale_l = 0.0f;
|
||||
|
||||
if constexpr (iswa) {
|
||||
freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
} else {
|
||||
freq_base_l = freq_base;
|
||||
freq_scale_l = freq_scale;
|
||||
}
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
|
|
@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
|
||||
|
|
@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
template struct llm_build_gemma3<false>;
|
||||
template struct llm_build_gemma3<true>;
|
||||
|
|
@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|||
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3_iswa : public llm_graph_context {
|
||||
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
template <bool iswa>
|
||||
struct llm_build_gemma3 : public llm_graph_context {
|
||||
llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||
|
|
|
|||
|
|
@ -972,6 +972,9 @@ json oaicompat_chat_params_parse(
|
|||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
if (body.contains("reasoning_format")) {
|
||||
inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
|
||||
}
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
if (body.contains("grammar")) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue