mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-08 09:59:50 +00:00
Merge commit '12280ae905' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # common/CMakeLists.txt # docs/docker.md # examples/model-conversion/scripts/causal/compare-logits.py # ggml/src/ggml-hexagon/htp/rope-ops.c # tests/test-backend-ops.cpp # tests/test-barrier.cpp # tools/server/CMakeLists.txt # tools/server/README.md
This commit is contained in:
commit
e88bf41fdc
49 changed files with 1380 additions and 583 deletions
|
|
@ -49,6 +49,7 @@
|
|||
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
using namespace common_arg_utils;
|
||||
|
||||
static std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
LLAMA_EXAMPLE_MTMD,
|
||||
|
|
@ -66,6 +67,15 @@ static std::string read_file(const std::string & fname) {
|
|||
return content;
|
||||
}
|
||||
|
||||
static const std::vector<common_arg> & get_common_arg_defs() {
|
||||
static const std::vector<common_arg> options = [] {
|
||||
common_params params;
|
||||
auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
|
||||
return ctx.options;
|
||||
}();
|
||||
return options;
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = examples;
|
||||
return *this;
|
||||
|
|
@ -136,7 +146,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string common_arg::to_string() {
|
||||
std::string common_arg::to_string() const {
|
||||
// params for printing to console
|
||||
const static int n_leading_spaces = 40;
|
||||
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
|
||||
|
|
@ -649,6 +659,53 @@ static void add_rpc_devices(const std::string & servers) {
|
|||
}
|
||||
}
|
||||
|
||||
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
|
||||
common_params dummy_params;
|
||||
common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
|
||||
|
||||
std::unordered_map<std::string, common_arg *> arg_to_options;
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
for (const auto & arg : opt.args) {
|
||||
arg_to_options[arg] = &opt;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO @ngxson : find a way to deduplicate this code
|
||||
|
||||
// handle command line arguments
|
||||
auto check_arg = [&](int i) {
|
||||
if (i+1 >= argc) {
|
||||
throw std::invalid_argument("expected value for argument");
|
||||
}
|
||||
};
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
std::string arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
auto opt = *arg_to_options[arg];
|
||||
std::string val;
|
||||
if (opt.value_hint != nullptr) {
|
||||
// arg with single value
|
||||
check_arg(i);
|
||||
val = argv[++i];
|
||||
}
|
||||
if (opt.value_hint_2 != nullptr) {
|
||||
// TODO: support arg with 2 values
|
||||
throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
|
||||
}
|
||||
out_map[opt] = val;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
||||
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
||||
|
|
@ -694,25 +751,19 @@ static std::string list_builtin_chat_templates() {
|
|||
return msg.str();
|
||||
}
|
||||
|
||||
static bool is_truthy(const std::string & value) {
|
||||
bool common_arg_utils::is_truthy(const std::string & value) {
|
||||
return value == "on" || value == "enabled" || value == "1";
|
||||
}
|
||||
|
||||
static bool is_falsey(const std::string & value) {
|
||||
bool common_arg_utils::is_falsey(const std::string & value) {
|
||||
return value == "off" || value == "disabled" || value == "0";
|
||||
}
|
||||
|
||||
static bool is_autoy(const std::string & value) {
|
||||
bool common_arg_utils::is_autoy(const std::string & value) {
|
||||
return value == "auto" || value == "-1";
|
||||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// default values specific to example
|
||||
// note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
|
||||
if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
params.use_jinja = true;
|
||||
}
|
||||
|
||||
params.use_color = tty_can_use_colors();
|
||||
|
||||
// load dynamic backends
|
||||
|
|
@ -1807,7 +1858,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
{"-mm", "--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
||||
"note: if -hf is used, this argument can be omitted",
|
||||
[](common_params & params, const std::string & value) {
|
||||
|
|
@ -1815,7 +1866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
{"-mmu", "--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
|
|
@ -2545,6 +2596,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.models_dir = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
|
||||
add_opt(common_arg(
|
||||
{"--models-preset"}, "PATH",
|
||||
"path to INI file containing model presets for the router server (default: disabled)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.models_preset = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
|
||||
add_opt(common_arg(
|
||||
{"--models-max"}, "N",
|
||||
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
|
||||
|
|
@ -2561,14 +2619,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"--jinja"},
|
||||
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
|
||||
string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.use_jinja = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
|
||||
add_opt(common_arg(
|
||||
{"--no-jinja"},
|
||||
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
|
||||
string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
|
||||
[](common_params & params) {
|
||||
params.use_jinja = false;
|
||||
}
|
||||
|
|
|
|||
32
common/arg.h
32
common/arg.h
|
|
@ -3,8 +3,10 @@
|
|||
#include "common.h"
|
||||
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
|
|
@ -24,6 +26,8 @@ struct common_arg {
|
|||
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (common_params & params, int) = nullptr;
|
||||
|
||||
common_arg() = default;
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
|
|
@ -61,9 +65,29 @@ struct common_arg {
|
|||
bool is_exclude(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output) const;
|
||||
bool has_value_from_env() const;
|
||||
std::string to_string();
|
||||
std::string to_string() const;
|
||||
|
||||
// for using as key in std::map
|
||||
// Orders options by their first argument name (strcmp on args[0]).
// An option with no argument names sorts before every option that has one, and two
// such options are equivalent. Returning false whenever either side is empty would make
// an empty option equivalent to every other key, which is not a strict weak ordering
// and is therefore not usable as a std::map comparison.
bool operator<(const common_arg& other) const {
    if (args.empty() || other.args.empty()) {
        return args.empty() && !other.args.empty();
    }
    return strcmp(args[0], other.args[0]) < 0;
}
|
||||
// Two options are equal when their first argument names are the same string.
// An option without any argument name never compares equal, not even to itself.
bool operator==(const common_arg& other) const {
    const bool both_named = !args.empty() && !other.args.empty();
    return both_named && strcmp(args[0], other.args[0]) == 0;
}
|
||||
};
|
||||
|
||||
namespace common_arg_utils {
|
||||
bool is_truthy(const std::string & value);
|
||||
bool is_falsey(const std::string & value);
|
||||
bool is_autoy(const std::string & value);
|
||||
}
|
||||
|
||||
struct common_params_context {
|
||||
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||
common_params & params;
|
||||
|
|
@ -76,7 +100,11 @@ struct common_params_context {
|
|||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
// parse input arguments from CLI into a map
|
||||
// TODO: support repeated args in the future
|
||||
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
struct common_remote_params {
|
||||
|
|
|
|||
|
|
@ -460,7 +460,7 @@ struct common_params {
|
|||
std::string public_path = ""; // NOLINT
|
||||
std::string api_prefix = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool use_jinja = true; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
int reasoning_budget = -1;
|
||||
|
|
@ -480,9 +480,10 @@ struct common_params {
|
|||
bool endpoint_metrics = false;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
std::string models_preset = ""; // path to the INI file containing model presets for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <future>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
|
@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
|||
|
||||
#elif defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
static bool is_output_a_tty() {
|
||||
class ProgressBar {
|
||||
static inline std::mutex mutex;
|
||||
static inline std::map<const ProgressBar *, int> lines;
|
||||
static inline int max_line = 0;
|
||||
|
||||
static void cleanup(const ProgressBar * line) {
|
||||
lines.erase(line);
|
||||
if (lines.empty()) {
|
||||
max_line = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_output_a_tty() {
|
||||
#if defined(_WIN32)
|
||||
return _isatty(_fileno(stdout));
|
||||
return _isatty(_fileno(stdout));
|
||||
#else
|
||||
return isatty(1);
|
||||
return isatty(1);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void print_progress(size_t current, size_t total) {
|
||||
if (!is_output_a_tty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!total) {
|
||||
return;
|
||||
public:
|
||||
ProgressBar() = default;
|
||||
|
||||
~ProgressBar() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
cleanup(this);
|
||||
}
|
||||
|
||||
size_t width = 50;
|
||||
size_t pct = (100 * current) / total;
|
||||
size_t pos = (width * current) / total;
|
||||
void update(size_t current, size_t total) {
|
||||
if (!is_output_a_tty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "["
|
||||
<< std::string(pos, '=')
|
||||
<< (pos < width ? ">" : "")
|
||||
<< std::string(width - pos, ' ')
|
||||
<< "] " << std::setw(3) << pct << "% ("
|
||||
<< current / (1024 * 1024) << " MB / "
|
||||
<< total / (1024 * 1024) << " MB)\r";
|
||||
std::cout.flush();
|
||||
}
|
||||
if (!total) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (lines.find(this) == lines.end()) {
|
||||
lines[this] = max_line++;
|
||||
std::cout << "\n";
|
||||
}
|
||||
int lines_up = max_line - lines[this];
|
||||
|
||||
size_t width = 50;
|
||||
size_t pct = (100 * current) / total;
|
||||
size_t pos = (width * current) / total;
|
||||
|
||||
std::cout << "\033[s";
|
||||
|
||||
if (lines_up > 0) {
|
||||
std::cout << "\033[" << lines_up << "A";
|
||||
}
|
||||
std::cout << "\033[2K\r["
|
||||
<< std::string(pos, '=')
|
||||
<< (pos < width ? ">" : "")
|
||||
<< std::string(width - pos, ' ')
|
||||
<< "] " << std::setw(3) << pct << "% ("
|
||||
<< current / (1024 * 1024) << " MB / "
|
||||
<< total / (1024 * 1024) << " MB) "
|
||||
<< "\033[u";
|
||||
|
||||
std::cout.flush();
|
||||
|
||||
if (current == total) {
|
||||
cleanup(this);
|
||||
}
|
||||
}
|
||||
|
||||
ProgressBar(const ProgressBar &) = delete;
|
||||
ProgressBar & operator=(const ProgressBar &) = delete;
|
||||
};
|
||||
|
||||
static bool common_pull_file(httplib::Client & cli,
|
||||
const std::string & resolve_path,
|
||||
|
|
@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
const char * func = __func__; // avoid __func__ inside a lambda
|
||||
size_t downloaded = existing_size;
|
||||
size_t progress_step = 0;
|
||||
ProgressBar bar;
|
||||
|
||||
auto res = cli.Get(resolve_path, headers,
|
||||
[&](const httplib::Response &response) {
|
||||
|
|
@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
progress_step += len;
|
||||
|
||||
if (progress_step >= total_size / 1000 || downloaded == total_size) {
|
||||
print_progress(downloaded, total_size);
|
||||
bar.update(downloaded, total_size);
|
||||
progress_step = 0;
|
||||
}
|
||||
return true;
|
||||
|
|
@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
nullptr
|
||||
);
|
||||
|
||||
std::cout << "\n";
|
||||
|
||||
if (!res) {
|
||||
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
|
||||
return false;
|
||||
|
|
|
|||
180
common/preset.cpp
Normal file
180
common/preset.cpp
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
#include "arg.h"
|
||||
#include "preset.h"
|
||||
#include "peg-parser.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <filesystem>
|
||||
|
||||
// Returns `str` without its leading '-' characters ("--foo-bar" -> "foo-bar").
// Dashes after the first non-dash character are kept; a string made only of
// dashes (or an empty string) yields an empty string.
static std::string rm_leading_dashes(const std::string & str) {
    const size_t first = str.find_first_not_of('-');
    if (first == std::string::npos) {
        return std::string();
    }
    return str.substr(first);
}
|
||||
|
||||
std::vector<std::string> common_preset::to_args() const {
|
||||
std::vector<std::string> args;
|
||||
|
||||
for (const auto & [opt, value] : options) {
|
||||
args.push_back(opt.args.back()); // use the last arg as the main arg
|
||||
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
||||
// flag option, no value
|
||||
if (common_arg_utils::is_falsey(value)) {
|
||||
// skip the flag
|
||||
args.pop_back();
|
||||
}
|
||||
}
|
||||
if (opt.value_hint != nullptr) {
|
||||
// single value
|
||||
args.push_back(value);
|
||||
}
|
||||
if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
|
||||
throw std::runtime_error(string_format(
|
||||
"common_preset::to_args(): option '%s' has two values, which is not supported yet",
|
||||
opt.args.back()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
return args;
|
||||
}
|
||||
|
||||
std::string common_preset::to_ini() const {
|
||||
std::ostringstream ss;
|
||||
|
||||
ss << "[" << name << "]\n";
|
||||
for (const auto & [opt, value] : options) {
|
||||
auto espaced_value = value;
|
||||
string_replace_all(espaced_value, "\n", "\\\n");
|
||||
ss << rm_leading_dashes(opt.args.back()) << " = ";
|
||||
ss << espaced_value << "\n";
|
||||
}
|
||||
ss << "\n";
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
|
||||
std::map<std::string, std::map<std::string, std::string>> parsed;
|
||||
|
||||
if (!std::filesystem::exists(path)) {
|
||||
throw std::runtime_error("preset file does not exist: " + path);
|
||||
}
|
||||
|
||||
std::ifstream file(path);
|
||||
if (!file.good()) {
|
||||
throw std::runtime_error("failed to open server preset file: " + path);
|
||||
}
|
||||
|
||||
std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
|
||||
static const auto parser = build_peg_parser([](auto & p) {
|
||||
// newline ::= "\r\n" / "\n" / "\r"
|
||||
auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
|
||||
|
||||
// ws ::= [ \t]*
|
||||
auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
|
||||
|
||||
// comment ::= [;#] (!newline .)*
|
||||
auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
|
||||
|
||||
// eol ::= ws comment? (newline / EOF)
|
||||
auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
|
||||
|
||||
// ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
|
||||
auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
|
||||
|
||||
// value ::= (!eol-start .)*
|
||||
auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
|
||||
auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
|
||||
|
||||
// header-line ::= "[" ws ident ws "]" eol
|
||||
auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
|
||||
|
||||
// kv-line ::= ident ws "=" ws value eol
|
||||
auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
|
||||
|
||||
// comment-line ::= ws comment (newline / EOF)
|
||||
auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
|
||||
|
||||
// blank-line ::= ws (newline / EOF)
|
||||
auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
|
||||
|
||||
// line ::= header-line / kv-line / comment-line / blank-line
|
||||
auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
|
||||
|
||||
// ini ::= line* EOF
|
||||
auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
|
||||
|
||||
return ini;
|
||||
});
|
||||
|
||||
common_peg_parse_context ctx(contents);
|
||||
const auto result = parser.parse(ctx);
|
||||
if (!result.success()) {
|
||||
throw std::runtime_error("failed to parse server config file: " + path);
|
||||
}
|
||||
|
||||
std::string current_section = COMMON_PRESET_DEFAULT_NAME;
|
||||
std::string current_key;
|
||||
|
||||
ctx.ast.visit(result, [&](const auto & node) {
|
||||
if (node.tag == "section-name") {
|
||||
const std::string section = std::string(node.text);
|
||||
current_section = section;
|
||||
parsed[current_section] = {};
|
||||
} else if (node.tag == "key") {
|
||||
const std::string key = std::string(node.text);
|
||||
current_key = key;
|
||||
} else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
|
||||
parsed[current_section][current_key] = std::string(node.text);
|
||||
current_key.clear();
|
||||
}
|
||||
});
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
// Build the lookup table used to resolve preset keys to options.
// Every option is registered under its environment variable name (when it has one)
// and under each of its argument names with the leading dashes removed.
// When two options share a key, the one that comes later in ctx_params.options wins.
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
    std::map<std::string, common_arg> by_key;

    for (const common_arg & opt : ctx_params.options) {
        if (opt.env != nullptr) {
            by_key.insert_or_assign(opt.env, opt);
        }
        for (const auto & arg : opt.args) {
            by_key.insert_or_assign(rm_leading_dashes(arg), opt);
        }
    }

    return by_key;
}
|
||||
|
||||
// Load every preset found in the INI file at `path`.
//
//   path       : INI file to read
//   ctx_params : parser context whose options define the keys a preset may contain
//                (environment variable names and argument names without leading dashes)
//
// Returns a map of preset name -> preset, one entry per INI section. A section with an
// empty name is stored as COMMON_PRESET_DEFAULT_NAME. Keys that do not match any option
// are ignored. Errors from reading or parsing the file propagate as exceptions.
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
    common_presets out;
    const auto key_to_opt = get_map_key_opt(ctx_params);
    const auto ini_data   = parse_ini_from_file(path);

    // iterate by reference: no per-section copy of the key/value map
    for (const auto & [section_name, section_values] : ini_data) {
        common_preset preset;
        if (section_name.empty()) {
            preset.name = COMMON_PRESET_DEFAULT_NAME;
        } else {
            preset.name = section_name;
        }
        LOG_DBG("loading preset: %s\n", preset.name.c_str());

        for (const auto & [key, value] : section_values) {
            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());

            const auto it = key_to_opt.find(key);
            if (it == key_to_opt.end()) {
                // TODO: maybe warn about unknown key?
                continue;
            }
            preset.options[it->second] = value;
            LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
        }

        out[preset.name] = preset;
    }

    return out;
}
|
||||
32
common/preset.h
Normal file
32
common/preset.h
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "arg.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
//
|
||||
// INI preset parser and writer
|
||||
//
|
||||
|
||||
constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
|
||||
|
||||
struct common_preset {
|
||||
std::string name;
|
||||
// TODO: support repeated args in the future
|
||||
std::map<common_arg, std::string> options;
|
||||
|
||||
// convert preset to CLI argument list
|
||||
std::vector<std::string> to_args() const;
|
||||
|
||||
// convert preset to INI format string
|
||||
std::string to_ini() const;
|
||||
|
||||
// TODO: maybe implement to_env() if needed
|
||||
};
|
||||
|
||||
// interface for multiple presets in one file
|
||||
using common_presets = std::map<std::string, common_preset>;
|
||||
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
|
||||
|
|
@ -2330,13 +2330,11 @@ extern "C" {
|
|||
float stop,
|
||||
float step);
|
||||
|
||||
#define GGML_KQ_MASK_PAD 1
|
||||
|
||||
// q: [n_embd_k, n_batch, n_head, ne3 ]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
||||
// q: [n_embd_k, n_batch, n_head, ne3 ]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
|
||||
// mask: [n_kv, n_batch, ne32, ne33]
|
||||
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
||||
//
|
||||
// broadcast:
|
||||
// n_head % n_head_kv == 0
|
||||
|
|
|
|||
|
|
@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
|
|||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
|
||||
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
||||
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, addr, tensor);
|
||||
#endif
|
||||
|
||||
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
|
||||
|
||||
// see if we can merge with an existing block
|
||||
|
|
@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
|
|||
}
|
||||
// otherwise, add a new block
|
||||
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
|
||||
|
||||
GGML_UNUSED(tensor);
|
||||
}
|
||||
|
||||
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
||||
|
|
@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
|
|||
|
||||
GGML_ASSERT(parent_size >= node_size);
|
||||
|
||||
// note: we want after the freeing the chunks to continue to be aligned
|
||||
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
|
||||
parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
|
||||
node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
|
||||
|
||||
if (parent_size > node_size) {
|
||||
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
|
||||
struct buffer_address p_addr = p_hn->addr;
|
||||
p_addr.offset += node_size;
|
||||
size_t extra_size = parent_size - node_size;
|
||||
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
|
||||
ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
|
||||
ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
|
|||
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
||||
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
||||
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
||||
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
|
||||
|
||||
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
||||
__func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, hn->addr, node);
|
||||
#endif
|
||||
|
||||
ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
|
||||
hn->allocated = false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -191,6 +191,9 @@ typedef void * thread_ret_t;
|
|||
|
||||
typedef pthread_t ggml_thread_t;
|
||||
|
||||
#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
|
||||
#define GGML_THREADPOOL_N_THREADS_BITS (16)
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <unistd.h>
|
||||
#include <mach/mach.h>
|
||||
|
|
@ -453,7 +456,7 @@ struct ggml_threadpool {
|
|||
struct ggml_cplan * cplan;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
||||
atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
|
||||
atomic_int GGML_CACHE_ALIGN n_barrier;
|
||||
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
|
||||
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
||||
|
|
@ -461,12 +464,10 @@ struct ggml_threadpool {
|
|||
// these are atomic as an annotation for thread-sanitizer
|
||||
atomic_bool stop; // Used for stopping the threadpool altogether
|
||||
atomic_bool pause; // Used for pausing the threadpool or individual threads
|
||||
atomic_int abort; // Used for aborting processing of a graph
|
||||
atomic_int abort; // Used for aborting processing of a graph
|
||||
|
||||
struct ggml_compute_state * workers; // per thread state
|
||||
int n_threads_max; // number of threads in the pool
|
||||
atomic_int n_threads_cur; // number of threads used in the current graph
|
||||
|
||||
int n_threads; // Number of threads in the pool
|
||||
int32_t prio; // Scheduling priority
|
||||
uint32_t poll; // Polling level (0 - no polling)
|
||||
|
||||
|
|
@ -543,7 +544,7 @@ struct ggml_state {
|
|||
static struct ggml_state g_state = {0};
|
||||
|
||||
void ggml_barrier(struct ggml_threadpool * tp) {
|
||||
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
|
||||
int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
|
||||
if (n_threads == 1) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -560,7 +561,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
|
|||
// last thread
|
||||
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
||||
|
||||
// exit barrier (fill seq-cst fence)
|
||||
// exit barrier (full seq-cst fence)
|
||||
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
|
||||
return;
|
||||
}
|
||||
|
|
@ -3475,7 +3476,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
|
|||
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||
if (!threadpool) return;
|
||||
|
||||
const int n_threads = threadpool->n_threads_max;
|
||||
const int n_threads = threadpool->n_threads;
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
struct ggml_compute_state* workers = threadpool->workers;
|
||||
|
|
@ -3551,7 +3552,7 @@ struct ggml_cplan ggml_graph_plan(
|
|||
//GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
|
||||
}
|
||||
if (n_threads <= 0) {
|
||||
n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
|
||||
n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
|
||||
}
|
||||
|
||||
#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
|
||||
|
|
@ -3778,12 +3779,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
struct ggml_compute_params params = {
|
||||
/*.ith =*/ state->ith,
|
||||
/*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
|
||||
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
|
||||
/*.wsize =*/ cplan->work_size,
|
||||
/*.wdata =*/ cplan->work_data,
|
||||
/*.threadpool=*/ tp,
|
||||
};
|
||||
|
||||
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
|
|
@ -3805,6 +3808,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
}
|
||||
}
|
||||
|
||||
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
|
||||
|
||||
ggml_barrier(state->threadpool);
|
||||
|
||||
return 0;
|
||||
|
|
@ -3812,27 +3817,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
|
||||
// check if thread is active
|
||||
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
|
||||
struct ggml_threadpool * threadpool = state->threadpool;
|
||||
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
|
||||
return (state->ith < n_threads);
|
||||
}
|
||||
|
||||
// check if thread is ready to proceed (exit from polling or sleeping)
|
||||
// returns true if loops should exit, sets state->pending to indicate new work
|
||||
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
|
||||
struct ggml_threadpool * threadpool = state->threadpool;
|
||||
|
||||
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
|
||||
|
||||
// check for new graph/work
|
||||
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
|
||||
if (new_graph != state->last_graph) {
|
||||
state->pending = ggml_graph_compute_thread_active(state);
|
||||
state->last_graph = new_graph;
|
||||
int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
|
||||
int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
|
||||
if (n_graph != state->last_graph) {
|
||||
state->pending = (state->ith < n_threads);
|
||||
state->last_graph = n_graph;
|
||||
return true;
|
||||
}
|
||||
|
||||
return state->pending;
|
||||
return false;
|
||||
}
|
||||
|
||||
// sync thread state after polling
|
||||
|
|
@ -3849,11 +3850,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
|
|||
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
||||
struct ggml_threadpool * threadpool = state->threadpool;
|
||||
|
||||
// Skip polling for unused threads
|
||||
if (!ggml_graph_compute_thread_active(state)) {
|
||||
return state->pending;
|
||||
}
|
||||
|
||||
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
|
||||
// Perhaps, we can adjust it dynamically based on load and things.
|
||||
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
|
||||
|
|
@ -3915,7 +3911,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
|
|||
ggml_graph_compute_check_for_work(state);
|
||||
if (state->pending) {
|
||||
state->pending = false;
|
||||
|
||||
ggml_graph_compute_thread(state);
|
||||
}
|
||||
}
|
||||
|
|
@ -3930,14 +3925,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
|
|||
|
||||
ggml_mutex_lock(&threadpool->mutex);
|
||||
|
||||
GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
|
||||
// Update the number of active threads and the graph count
|
||||
int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
|
||||
n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
|
||||
|
||||
// Update the number of active threads
|
||||
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
|
||||
GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
|
||||
|
||||
// Indicate the graph is ready to be processed
|
||||
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
|
||||
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
|
||||
atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
|
||||
|
||||
if (threadpool->pause) {
|
||||
// Update main thread prio and affinity to match the threadpool settings
|
||||
|
|
@ -3975,8 +3971,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
|||
threadpool->pause = tpp->paused;
|
||||
threadpool->abort = -1;
|
||||
threadpool->workers = NULL;
|
||||
threadpool->n_threads_max = tpp->n_threads;
|
||||
threadpool->n_threads_cur = tpp->n_threads;
|
||||
threadpool->n_threads = tpp->n_threads;
|
||||
threadpool->poll = tpp->poll;
|
||||
threadpool->prio = tpp->prio;
|
||||
threadpool->ec = GGML_STATUS_SUCCESS;
|
||||
|
|
@ -4071,7 +4066,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|||
{
|
||||
// update the number of threads from the actual number of threads that we got from OpenMP
|
||||
n_threads = omp_get_num_threads();
|
||||
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
|
||||
atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Apply thread CPU mask and priority
|
||||
|
|
@ -4084,13 +4079,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|||
ggml_graph_compute_thread(&threadpool->workers[ith]);
|
||||
}
|
||||
} else {
|
||||
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
|
||||
atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
|
||||
ggml_graph_compute_thread(&threadpool->workers[0]);
|
||||
}
|
||||
#else
|
||||
if (n_threads > threadpool->n_threads_max) {
|
||||
GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
|
||||
n_threads = threadpool->n_threads_max;
|
||||
if (n_threads > threadpool->n_threads) {
|
||||
GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
|
||||
n_threads = threadpool->n_threads;
|
||||
}
|
||||
|
||||
// Kick all threads to start the new graph
|
||||
|
|
|
|||
|
|
@ -67,19 +67,22 @@
|
|||
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
||||
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
||||
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
||||
#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
|
||||
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
|
||||
|
||||
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
|
||||
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
|
||||
#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
|
||||
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
|
||||
|
|
|
|||
|
|
@ -4643,9 +4643,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_OP_CUMSUM:
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
return true;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
|
|||
return 8 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return 2 * l + (threadIdx.x / 16);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
|
|
@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
|
|||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
|
|
@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
|
|||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
|
|
@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
|
|||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 8) return true;
|
||||
return false;
|
||||
|
|
@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
|
|||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 8 && J == 8) return true;
|
||||
if (I == 16 && J == 4) return true;
|
||||
|
|
@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
|
|||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
|
||||
#if defined(RDNA4)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
#elif defined(RDNA3)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
} else if constexpr (std::is_same_v<T, int>) {
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
|
|
@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
|
|||
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
|
||||
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||
#elif defined(RDNA3)
|
||||
using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
|
||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||
const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
|
||||
const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
|
|
@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
|
|||
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
|
||||
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||
#elif defined(RDNA3)
|
||||
using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
|
||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||
const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
|
||||
const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
|
|
|
|||
|
|
@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
|||
return false;
|
||||
}
|
||||
} else {
|
||||
if (src1_ncols > 16) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
|
||||
return false;
|
||||
} else if (src1_ncols > 16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
|||
case GGML_TYPE_F32:
|
||||
return ampere_mma_available(cc);
|
||||
case GGML_TYPE_F16:
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
|
||||
case GGML_TYPE_BF16:
|
||||
return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
|
||||
return ampere_mma_available(cc) || amd_wmma_available(cc);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
|
|||
return ne11 <= 8;
|
||||
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
if (fp16_mma_hardware_available(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return ne11 <= 3;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return ne11 <= 5;
|
||||
}
|
||||
return ne11 <= 2;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,80 @@
|
|||
#include "solve_tri.cuh"
|
||||
|
||||
#define MAX_N_FAST 64
|
||||
#define MAX_K_FAST 32
|
||||
|
||||
// Device kernel that fills the per-batch pointer tables used by the batched solve.
// Each thread with idx < total_batches writes exactly one entry of A_ptrs and X_ptrs:
//   i3 = idx / ne02, i2 = idx % ne02
//   A_ptrs[idx] = A + i3 * s03 + i2 * s02
//   X_ptrs[idx] = X + i3 * s3  + i2 * s2
// The strides (s02, s03, s2, s3) are added directly to float pointers, so they are
// counted in float elements, not bytes.
static __global__ void get_batch_pointers(const float * A,
|
||||
float * X,
|
||||
const float ** A_ptrs,
|
||||
float ** X_ptrs,
|
||||
int64_t ne02,
|
||||
int64_t total_batches,
|
||||
size_t s02,
|
||||
size_t s03,
|
||||
size_t s2,
|
||||
size_t s3) {
|
||||
// NOTE(review): idx is a 32-bit int while total_batches is int64_t; this is fine as long as
// the batch count fits in an int - confirm against the callers.
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// Threads past the last batch have nothing to write.
if (idx >= total_batches) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Split the flat batch index into the two batch coordinates.
const int64_t i3 = idx / ne02;
|
||||
const int64_t i2 = idx % ne02;
|
||||
|
||||
A_ptrs[idx] = A + i3 * s03 + i2 * s02;
|
||||
X_ptrs[idx] = X + i3 * s3 + i2 * s2;
|
||||
}
|
||||
|
||||
// Batched triangular solve through cuBLAS (cublasStrsmBatched); the result is written to X.
//   ctx          : backend context providing the memory pool and the cuBLAS handle
//   A, B, X      : device pointers; B is bulk-copied into X first (unless they alias) and
//                  the solve is then performed on X
//   n, k         : matrix sizes; strsm is called with m = k, n = n, lda = n, ldb = k
//   ne02, ne03   : batch dimensions; total_batches = ne02 * ne03 (nothing is done when it is 0)
//   s02, s03     : strides of A between batches, in float elements
//   s12, s13     : unused (B is copied as one contiguous range)
//   s2, s3       : strides of X between batches, in float elements
//   stream       : stream used for the copy, the pointer-setup kernel and the cuBLAS call
// NOTE(review): SIDE_RIGHT / FILL_MODE_UPPER is presumably used because cuBLAS is column-major
// while the data here is row-major - confirm against the cuBLAS strsm documentation.
static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
|
||||
const float * A,
|
||||
const float * B,
|
||||
float * X,
|
||||
int n,
|
||||
int k,
|
||||
int64_t ne02,
|
||||
int64_t ne03,
|
||||
size_t s02,
|
||||
size_t s03,
|
||||
size_t s12,
|
||||
size_t s13,
|
||||
size_t s2,
|
||||
size_t s3,
|
||||
cudaStream_t stream) {
|
||||
const float alpha = 1.0f;
|
||||
const int64_t total_batches = ne02 * ne03;
|
||||
if (total_batches == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Bulk copy B -> X (contiguous tensors)
|
||||
if (X != B) {
|
||||
// Widen before multiplying: n and k are int, so n * k would otherwise be evaluated in
// 32-bit arithmetic and could overflow before being promoted to int64_t.
const int64_t total_elements_BX = (int64_t) n * k * total_batches;
|
||||
CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
|
||||
// Device-side tables holding one A pointer and one X pointer per batch.
ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
|
||||
ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches);
|
||||
|
||||
const float ** A_ptrs_dev = A_ptrs_alloc.get();
|
||||
float ** X_ptrs_dev = X_ptrs_alloc.get();
|
||||
|
||||
// One thread per batch, 256 threads per block (rounded up).
get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
|
||||
total_batches, s02, s03, s2, s3);
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
|
||||
// Yes, this is necessary, without this we get RMSE errors
|
||||
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
|
||||
CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||
CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
|
||||
|
||||
// revert to standard mode from common.cuh
|
||||
// NOTE(review): the mode is set unconditionally to TF32 rather than restored to whatever it was
// before this call - confirm that TF32 is always the handle's mode on entry.
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
|
||||
|
||||
GGML_UNUSED_VARS(s12, s13);
|
||||
}
|
||||
|
||||
// ======================
|
||||
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
|
||||
|
|
@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
|||
float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
|
||||
float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
|
||||
|
||||
const int half = WARP_SIZE;
|
||||
const int half = WARP_SIZE;
|
||||
const int nrows_low = (n < half) ? n : half;
|
||||
|
||||
#pragma unroll
|
||||
|
|
@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
|||
|
||||
#pragma unroll
|
||||
for (int row = half; row < n; ++row) {
|
||||
float sum = sA[row * n + lane] * x_low;
|
||||
const int j = half + lane;
|
||||
float sum = sA[row * n + lane] * x_low;
|
||||
const int j = half + lane;
|
||||
if (j < row) {
|
||||
sum += sA[row * n + j] * x_high;
|
||||
}
|
||||
|
|
@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
|||
for (int rr = 0; rr < 2; ++rr) {
|
||||
const int row = rr * WARP_SIZE + lane;
|
||||
if (row < n) {
|
||||
const float val = (row < half) ? x_low : x_high;
|
||||
const float val = (row < half) ? x_low : x_high;
|
||||
X_batch[row * k + col_idx] = val;
|
||||
}
|
||||
}
|
||||
|
|
@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
|
|||
}
|
||||
|
||||
// CUDA entry point for the SOLVE_TRI op: reads A from dst->src[0] and B from dst->src[1]
// and writes the solution into dst->data on ctx.stream().
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix)
|
||||
const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns)
|
||||
const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular)
|
||||
const ggml_tensor * src1 = dst->src[1]; // B (n×k)
|
||||
|
||||
// NOTE(review): the return values of these two calls are discarded, so they check nothing.
// The cuBLAS path bulk-copies B into X as one contiguous range, so if contiguity is a
// requirement these were presumably meant to be GGML_ASSERT(ggml_is_contiguous(...)) - confirm.
ggml_is_contiguous(src0);
|
||||
ggml_is_contiguous(src1);
|
||||
|
||||
const int64_t n = src0->ne[0];
|
||||
const int64_t k = src1->ne[0];
|
||||
const int64_t n = src0->ne[0];
|
||||
const int64_t k = src1->ne[0];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
GGML_ASSERT(n <= 64);
|
||||
GGML_ASSERT(k <= 32);
|
||||
|
||||
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
|
||||
src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
// Dispatch on problem size: sizes within MAX_N_FAST x MAX_K_FAST go to solve_tri_f32_cuda,
// everything else goes to solve_tri_f32_cublas. Byte strides (nb) are converted to
// float-element strides before being passed down.
if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
|
||||
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
|
||||
src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
} else {
|
||||
solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
|
||||
ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
4
ggml/src/ggml-cuda/vendors/hip.h
vendored
4
ggml/src/ggml-cuda/vendors/hip.h
vendored
|
|
@ -19,6 +19,9 @@
|
|||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_16BF HIPBLAS_R_16B
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
|
||||
#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
|
||||
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
|
||||
|
|
@ -30,6 +33,7 @@
|
|||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define __all_sync(mask, var) __all(var)
|
||||
#define __any_sync(mask, var) __any(var)
|
||||
#define cublasStrsmBatched hipblasStrsmBatched
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
|
|
|
|||
5
ggml/src/ggml-cuda/vendors/musa.h
vendored
5
ggml/src/ggml-cuda/vendors/musa.h
vendored
|
|
@ -12,11 +12,16 @@
|
|||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
|
||||
#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
|
||||
#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
|
||||
#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_16BF MUSA_R_16BF
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
#define cublasStrsmBatched mublasStrsmBatched
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#define cublasCreate mublasCreate
|
||||
#define cublasDestroy mublasDestroy
|
||||
|
|
|
|||
|
|
@ -5276,8 +5276,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|||
|
||||
if (mask) {
|
||||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
|
||||
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
|
||||
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
||||
|
||||
GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
|
||||
|
|
|
|||
|
|
@ -2378,7 +2378,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|||
|
||||
const float KQscale = pow(float(n_state_head), -0.25);
|
||||
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, 1), 1);
|
||||
ggml_set_name(KQ_mask, "KQ_mask");
|
||||
ggml_set_input(KQ_mask);
|
||||
|
||||
|
|
@ -2806,7 +2806,7 @@ static bool whisper_decode_internal(
|
|||
}
|
||||
}
|
||||
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, 1); ++i) {
|
||||
for (int j = 0; j < n_kv; ++j) {
|
||||
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
|||
udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
|
||||
udata->output .resize(n_tokens);
|
||||
|
||||
udata->seq_id_data.reserve(n_tokens);
|
||||
|
||||
seq_set_t seq_set_unq;
|
||||
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
|
|
@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
|||
}
|
||||
|
||||
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
|
||||
udata->seq_id[i] = batch.seq_id[idxs[i]];
|
||||
udata->output[i] = batch.logits[idxs[i]];
|
||||
|
||||
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
|
||||
seq_set_unq.set(udata->seq_id[i][s]);
|
||||
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
|
||||
|
||||
udata->seq_id_data.push_back(seq_id);
|
||||
seq_set_unq.set(seq_id);
|
||||
}
|
||||
|
||||
if (udata->output[i]) {
|
||||
|
|
@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
|||
}
|
||||
}
|
||||
|
||||
llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
udata->seq_id[i] = seq_id_ptr;
|
||||
seq_id_ptr += udata->n_seq_id[i];
|
||||
}
|
||||
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_set_unq.test(s)) {
|
||||
udata->seq_idx[s] = udata->seq_id_unq.size();
|
||||
|
|
|
|||
|
|
@ -56,13 +56,15 @@ struct llama_ubatch {
|
|||
std::vector<float> embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
|
||||
std::vector<llama_seq_id> seq_id_unq;
|
||||
std::vector<int32_t> seq_idx;
|
||||
std::vector<int8_t> output;
|
||||
|
||||
std::vector<llama_seq_id> seq_id_data;
|
||||
};
|
||||
|
||||
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
|
||||
// the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
|
||||
std::shared_ptr<data_t> data;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -96,14 +96,6 @@ llama_context::llama_context(
|
|||
// with causal attention, the batch size is limited by the context size
|
||||
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
||||
|
||||
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
||||
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
|
||||
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
|
||||
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
|
||||
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
|
||||
cparams.n_batch = GGML_KQ_MASK_PAD;
|
||||
}
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
|
||||
cparams.op_offload = params.op_offload;
|
||||
|
|
|
|||
|
|
@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
|||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
|||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
|
||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
|
||||
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
||||
res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|||
}
|
||||
}
|
||||
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int i = n_tokens; i < n_tokens; ++i) {
|
||||
for (int j = 0; j < n_enc; ++j) {
|
||||
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
|
||||
}
|
||||
|
|
@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
|
|||
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
|
||||
|
||||
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
|
|
@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
|||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
|||
|
||||
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
|
||||
|
||||
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
|
||||
ggml_set_input(inp->cross_kq_mask);
|
||||
|
||||
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
|
||||
|
|
@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
|||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
|||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
|
|
|
|||
|
|
@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
|
|||
GGML_ASSERT(n_tokens%n_stream == 0);
|
||||
|
||||
// n_tps == n_tokens_per_stream
|
||||
const int64_t n_tps = n_tokens/n_stream;
|
||||
const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
|
||||
const int64_t n_tps = n_tokens/n_stream;
|
||||
|
||||
std::fill(data, data + ggml_nelements(dst), -INFINITY);
|
||||
|
||||
|
|
@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
|
|||
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
||||
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
||||
|
||||
const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
|
||||
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
|
||||
|
||||
for (uint32_t j = 0; j < n_kv; ++j) {
|
||||
if (cells.is_empty(j)) {
|
||||
|
|
|
|||
|
|
@ -87,6 +87,10 @@ static void sigint_handler(int signo) {
|
|||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
// disable jinja by default
|
||||
params.use_jinja = false;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -622,11 +622,12 @@ struct clip_graph {
|
|||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
|
||||
cur = build_ffn(cur,
|
||||
|
|
@ -694,16 +695,12 @@ struct clip_graph {
|
|||
|
||||
// LlavaMultiModalProjector (always using GELU activation)
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
if (model.mm_1_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
}
|
||||
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
if (model.mm_2_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
}
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
}
|
||||
|
||||
// arrangement of the [IMG_BREAK] token
|
||||
|
|
@ -802,10 +799,6 @@ struct clip_graph {
|
|||
|
||||
// if flash attn is used, we need to pad the mask and cast to f16
|
||||
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
|
||||
if (n_pad > 0) {
|
||||
window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
|
||||
}
|
||||
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
|
||||
}
|
||||
|
||||
|
|
@ -818,7 +811,7 @@ struct clip_graph {
|
|||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
auto & layer = model.layers[il];
|
||||
const auto & layer = model.layers[il];
|
||||
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
|
||||
|
||||
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
||||
|
|
@ -897,16 +890,12 @@ struct clip_graph {
|
|||
// multimodal projection
|
||||
ggml_tensor * embeddings = inpL;
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
|
||||
embeddings = build_ffn(embeddings,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
|
||||
if (use_window_attn) {
|
||||
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
||||
|
|
@ -1284,11 +1273,12 @@ struct clip_graph {
|
|||
// projector LayerNorm uses pytorch's default eps = 1e-5
|
||||
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
|
||||
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_3_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_3_w, model.mm_3_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
|
|
@ -1439,11 +1429,12 @@ struct clip_graph {
|
|||
cb(cur, "proj_inp_normed", -1);
|
||||
|
||||
// projection mlp
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
cb(cur, "proj_out", -1);
|
||||
}
|
||||
|
||||
|
|
@ -1914,9 +1905,12 @@ struct clip_graph {
|
|||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
} else {
|
||||
GGML_ABORT("%s: unknown projector type", __func__);
|
||||
|
|
@ -2101,34 +2095,66 @@ private:
|
|||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
ggml_tensor * Qcur = nullptr;
|
||||
ggml_tensor * Kcur = nullptr;
|
||||
ggml_tensor * Vcur = nullptr;
|
||||
if (layer.qkv_w != nullptr) {
|
||||
// fused qkv
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
if (layer.qkv_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ 0);
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ ggml_row_size(cur->type, n_embd));
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
// TODO: q/k norm requires row size == n_embd, while here it's d_head
|
||||
// we can add support in the future if needed
|
||||
GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
} else {
|
||||
// separate q, k, v
|
||||
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
|
|
|||
|
|
@ -270,6 +270,7 @@ int main(int argc, char ** argv) {
|
|||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
|
||||
|
|
@ -317,7 +318,9 @@ int main(int argc, char ** argv) {
|
|||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
// most models require the marker before each image
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17616
|
||||
params.prompt = mtmd_default_marker() + params.prompt;
|
||||
}
|
||||
}
|
||||
common_chat_msg msg;
|
||||
|
|
|
|||
|
|
@ -32,23 +32,32 @@ fi
|
|||
|
||||
arr_prefix=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
arr_extra_args=()
|
||||
arr_file=()
|
||||
|
||||
add_test_vision() {
|
||||
local hf=$1
|
||||
local tmpl=${2:-""} # default to empty string if not provided
|
||||
shift
|
||||
local extra_args=""
|
||||
if [ $# -gt 0 ]; then
|
||||
extra_args=$(printf " %q" "$@")
|
||||
fi
|
||||
arr_prefix+=("[vision]")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
arr_extra_args+=("$extra_args")
|
||||
arr_file+=("test-1.jpeg")
|
||||
}
|
||||
|
||||
add_test_audio() {
|
||||
local hf=$1
|
||||
shift
|
||||
local extra_args=""
|
||||
if [ $# -gt 0 ]; then
|
||||
extra_args=$(printf " %q" "$@")
|
||||
fi
|
||||
arr_prefix+=("[audio] ")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("") # no need for chat tmpl
|
||||
arr_extra_args+=("$extra_args")
|
||||
arr_file+=("test-2.mp3")
|
||||
}
|
||||
|
||||
|
|
@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
|||
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
|
||||
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
|
|
@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
|
|||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
|
||||
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
|
|
@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
|
|||
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
|
|
@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
|
|||
bin="llama-mtmd-cli"
|
||||
prefix="${arr_prefix[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
extra_args="${arr_extra_args[$i]}"
|
||||
inp_file="${arr_file[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/$inp_file \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
|
||||
-hf $(printf %q "$hf") \
|
||||
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
2>&1 | tee /dev/tty)
|
||||
${extra_args}"
|
||||
|
||||
# if extra_args does not contain -p, we add a default prompt
|
||||
if ! [[ "$extra_args" =~ "-p" ]]; then
|
||||
cmd+=" -p \"what is the publisher name of the newspaper?\""
|
||||
fi
|
||||
|
||||
output=$(eval "$cmd" 2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
|
|
@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
|
|||
if echo "$output" | grep -iq "new york" \
|
||||
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
|
||||
then
|
||||
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
||||
result="$prefix \033[32mOK\033[0m: $hf"
|
||||
else
|
||||
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
||||
result="$prefix \033[31mFAIL\033[0m: $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1,6 +1,7 @@
|
|||
#include "server-common.h"
|
||||
#include "server-models.h"
|
||||
|
||||
#include "preset.h"
|
||||
#include "download.h"
|
||||
|
||||
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
||||
|
|
@ -33,6 +34,10 @@
|
|||
|
||||
#define CMD_EXIT "exit"
|
||||
|
||||
// address for child process, this is needed because router may run on 0.0.0.0
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||
#define CHILD_ADDR "127.0.0.1"
|
||||
|
||||
static std::filesystem::path get_server_exec_path() {
|
||||
#if defined(_WIN32)
|
||||
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
||||
|
|
@ -132,6 +137,93 @@ static std::vector<local_model> list_local_models(const std::string & dir) {
|
|||
return models;
|
||||
}
|
||||
|
||||
//
|
||||
// server_presets
|
||||
//
|
||||
|
||||
|
||||
// Construct the preset registry used by the router.
//   argc/argv    - the router's own command line; re-parsed below into base_args
//   base_params  - passed to common_params_parser_init() to build ctx_params
//   presets_path - presets file to load; nothing is loaded when it is empty
server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
        : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
    if (!presets_path.empty()) {
        presets = common_presets_load(presets_path, ctx_params);
        SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
    }

    // env names of the options that are reserved for the router
    // (their values will be appended by the router)
    static const char * const reserved_envs[] = {
        "LLAMA_ARG_PORT",
        "LLAMA_ARG_HOST",
        "LLAMA_ARG_ALIAS",
        "LLAMA_ARG_API_KEY",
        "LLAMA_ARG_MODELS_DIR",
        "LLAMA_ARG_MODELS_MAX",
        "LLAMA_ARG_MODELS_PRESET",
        "LLAMA_ARG_MODEL",
        "LLAMA_ARG_MMPROJ",
        "LLAMA_ARG_HF_REPO",
        "LLAMA_ARG_NO_MODELS_AUTOLOAD",
    };

    // remember the option definition behind every reserved env name
    for (auto & opt : ctx_params.options) {
        if (opt.env == nullptr) {
            continue; // option has no env name, so it cannot be a reserved one
        }
        const std::string env_name = opt.env;
        for (const char * reserved : reserved_envs) {
            if (env_name == reserved) {
                control_args[env_name] = opt;
                break;
            }
        }
    }

    // read base args from router's argv (the return value is not checked here)
    common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);

    // drop every router-controlled arg from base_args; erasing a missing key is a no-op
    for (const auto & entry : control_args) {
        base_args.erase(entry.second);
    }
}
|
||||
|
||||
// Look up a preset by name.
// Returns a copy of the stored preset, or a default-constructed common_preset
// when no preset with that name exists.
common_preset server_presets::get_preset(const std::string & name) {
    const auto found = presets.find(name);
    return found == presets.end() ? common_preset() : found->second;
}
|
||||
|
||||
// Rebuild meta.args (the full command line of the child instance) from three layers:
//   1. the model-specific preset stored in meta.preset
//   2. the base args taken from the router's own command line
//   3. the control args, whose values are always decided by the router
// Later layers overwrite earlier ones. meta.preset itself is not modified.
void server_presets::render_args(server_model_meta & meta) {
    common_preset merged = meta.preset; // work on a copy

    // layer 1: a preset is not allowed to set reserved args, strip them
    for (auto & entry : control_args) {
        auto & reserved = entry.second;
        if (merged.options.find(reserved) == merged.options.end()) {
            continue;
        }
        SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", merged.name.c_str(), reserved.args[0]);
        merged.options.erase(reserved);
    }

    // layer 2: inherit everything from the router's base args
    for (const auto & base : base_args) {
        merged.options[base.first] = base.second;
    }

    // layer 3: values controlled by the router
    merged.options[control_args["LLAMA_ARG_HOST"]]  = CHILD_ADDR;
    merged.options[control_args["LLAMA_ARG_PORT"]]  = std::to_string(meta.port);
    merged.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
    if (!meta.in_cache) {
        merged.options[control_args["LLAMA_ARG_MODEL"]] = meta.path;
        if (!meta.path_mmproj.empty()) {
            merged.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
        }
    } else {
        // cached models are passed by name via the HF repo arg
        merged.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name;
    }

    meta.args = merged.to_args();
    // the binary path goes first
    meta.args.insert(meta.args.begin(), get_server_exec_path().string());
}
|
||||
|
||||
//
|
||||
// server_models
|
||||
//
|
||||
|
|
@ -140,7 +232,7 @@ server_models::server_models(
|
|||
const common_params & params,
|
||||
int argc,
|
||||
char ** argv,
|
||||
char ** envp) : base_params(params) {
|
||||
char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
|
||||
for (int i = 0; i < argc; i++) {
|
||||
base_args.push_back(std::string(argv[i]));
|
||||
}
|
||||
|
|
@ -155,11 +247,58 @@ server_models::server_models(
|
|||
LOG_WRN("failed to get server executable path: %s\n", e.what());
|
||||
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
|
||||
}
|
||||
// TODO: allow refreshing cached model list
|
||||
// add cached models
|
||||
load_models();
|
||||
}
|
||||
|
||||
// Register a new model instance under meta.name.
// Throws std::runtime_error when a model with the same name is already registered.
// No lock is taken inside this function.
void server_models::add_model(server_model_meta && meta) {
    const bool already_known = mapping.find(meta.name) != mapping.end();
    if (already_known) {
        throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
    }

    // populate meta.args
    presets.render_args(meta);

    // copy the key first: meta is moved into the instance below
    const std::string key = meta.name;
    mapping[key] = instance_t{
        /* subproc */ std::make_shared<subprocess_s>(),
        /* th      */ std::thread(),
        /* meta    */ std::move(meta)
    };
}
|
||||
|
||||
// Collect the models whose file path is given directly inside a preset.
// For every preset, the LLAMA_ARG_MODEL / LLAMA_ARG_MMPROJ options are moved out of
// the preset (they are erased from preset.options) and into a local_model entry.
// Presets that do not provide a model path produce no entry.
static std::vector<local_model> list_custom_path_models(server_presets & presets) {
    std::vector<local_model> found;

    for (auto & [preset_name, preset] : presets.presets) {
        local_model entry;
        entry.name = preset_name;

        // options cannot be erased while iterating over them, so remember them first
        std::vector<common_arg> consumed;
        for (auto & [opt, val] : preset.options) {
            const std::string env_name(opt.env ? opt.env : "");
            if (env_name == "LLAMA_ARG_MODEL") {
                entry.path = val;
                consumed.push_back(opt);
            }
            if (env_name == "LLAMA_ARG_MMPROJ") {
                entry.path_mmproj = val;
                consumed.push_back(opt);
            }
        }
        for (auto & opt : consumed) {
            preset.options.erase(opt);
        }

        if (entry.name.empty() || entry.path.empty()) {
            continue; // not a custom-path model
        }
        found.push_back(entry);
    }

    return found;
}
|
||||
|
||||
// TODO: allow refreshing cached model list
|
||||
void server_models::load_models() {
|
||||
// loading models from 3 sources:
|
||||
// 1. cached models
|
||||
auto cached_models = common_list_cached_models();
|
||||
for (const auto & model : cached_models) {
|
||||
server_model_meta meta{
|
||||
/* preset */ presets.get_preset(model.to_string()),
|
||||
/* name */ model.to_string(),
|
||||
/* path */ model.manifest_path,
|
||||
/* path_mmproj */ "", // auto-detected when loading
|
||||
|
|
@ -170,21 +309,18 @@ server_models::server_models(
|
|||
/* args */ std::vector<std::string>(),
|
||||
/* exit_code */ 0
|
||||
};
|
||||
mapping[meta.name] = instance_t{
|
||||
/* subproc */ std::make_shared<subprocess_s>(),
|
||||
/* th */ std::thread(),
|
||||
/* meta */ meta
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
}
|
||||
// add local models specified via --models-dir
|
||||
if (!params.models_dir.empty()) {
|
||||
auto local_models = list_local_models(params.models_dir);
|
||||
// 2. local models specified via --models-dir
|
||||
if (!base_params.models_dir.empty()) {
|
||||
auto local_models = list_local_models(base_params.models_dir);
|
||||
for (const auto & model : local_models) {
|
||||
if (mapping.find(model.name) != mapping.end()) {
|
||||
// already exists in cached models, skip
|
||||
continue;
|
||||
}
|
||||
server_model_meta meta{
|
||||
/* preset */ presets.get_preset(model.name),
|
||||
/* name */ model.name,
|
||||
/* path */ model.path,
|
||||
/* path_mmproj */ model.path_mmproj,
|
||||
|
|
@ -195,13 +331,31 @@ server_models::server_models(
|
|||
/* args */ std::vector<std::string>(),
|
||||
/* exit_code */ 0
|
||||
};
|
||||
mapping[meta.name] = instance_t{
|
||||
/* subproc */ std::make_shared<subprocess_s>(),
|
||||
/* th */ std::thread(),
|
||||
/* meta */ meta
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
}
|
||||
}
|
||||
// 3. custom-path models specified in presets
|
||||
auto custom_models = list_custom_path_models(presets);
|
||||
for (const auto & model : custom_models) {
|
||||
server_model_meta meta{
|
||||
/* preset */ presets.get_preset(model.name),
|
||||
/* name */ model.name,
|
||||
/* path */ model.path,
|
||||
/* path_mmproj */ model.path_mmproj,
|
||||
/* in_cache */ false,
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* exit_code */ 0
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
}
|
||||
// log available models
|
||||
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
|
||||
for (const auto & [name, inst] : mapping) {
|
||||
SRV_INF(" %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
||||
|
|
@ -335,19 +489,7 @@ void server_models::unload_lru() {
|
|||
}
|
||||
}
|
||||
|
||||
static void add_or_replace_arg(std::vector<std::string> & args, const std::string & key, const std::string & value) {
|
||||
for (size_t i = 0; i < args.size(); i++) {
|
||||
if (args[i] == key && i + 1 < args.size()) {
|
||||
args[i + 1] = value;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// not found, append
|
||||
args.push_back(key);
|
||||
args.push_back(value);
|
||||
}
|
||||
|
||||
void server_models::load(const std::string & name, bool auto_load) {
|
||||
void server_models::load(const std::string & name) {
|
||||
if (!has_model(name)) {
|
||||
throw std::runtime_error("model name=" + name + " is not found");
|
||||
}
|
||||
|
|
@ -376,26 +518,10 @@ void server_models::load(const std::string & name, bool auto_load) {
|
|||
{
|
||||
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
|
||||
|
||||
std::vector<std::string> child_args;
|
||||
if (auto_load && !meta.args.empty()) {
|
||||
child_args = meta.args; // copy previous args
|
||||
} else {
|
||||
child_args = base_args; // copy
|
||||
if (inst.meta.in_cache) {
|
||||
add_or_replace_arg(child_args, "-hf", inst.meta.name);
|
||||
} else {
|
||||
add_or_replace_arg(child_args, "-m", inst.meta.path);
|
||||
if (!inst.meta.path_mmproj.empty()) {
|
||||
add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj);
|
||||
}
|
||||
}
|
||||
}
|
||||
presets.render_args(inst.meta); // update meta.args
|
||||
|
||||
// set model args
|
||||
add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port));
|
||||
add_or_replace_arg(child_args, "--alias", inst.meta.name);
|
||||
|
||||
std::vector<std::string> child_env = base_env; // copy
|
||||
std::vector<std::string> child_args = inst.meta.args; // copy
|
||||
std::vector<std::string> child_env = base_env; // copy
|
||||
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
|
||||
|
||||
SRV_INF("%s", "spawning server instance with args:\n");
|
||||
|
|
@ -541,7 +667,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
|
|||
}
|
||||
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
|
||||
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
|
||||
load(name, true);
|
||||
load(name);
|
||||
}
|
||||
|
||||
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
|
||||
|
|
@ -571,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
|
|||
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
|
||||
auto proxy = std::make_unique<server_http_proxy>(
|
||||
method,
|
||||
base_params.hostname,
|
||||
CHILD_ADDR,
|
||||
meta->port,
|
||||
req.path,
|
||||
req.headers,
|
||||
|
|
@ -724,38 +850,6 @@ void server_models_routes::init_routes() {
|
|||
return models.proxy_request(req, method, name, true); // update last usage for POST request only
|
||||
};
|
||||
|
||||
this->get_router_models = [this](const server_http_req &) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
json models_json = json::array();
|
||||
auto all_models = models.get_all_meta();
|
||||
std::time_t t = std::time(0);
|
||||
for (const auto & meta : all_models) {
|
||||
json status {
|
||||
{"value", server_model_status_to_string(meta.status)},
|
||||
{"args", meta.args},
|
||||
};
|
||||
if (meta.is_failed()) {
|
||||
status["exit_code"] = meta.exit_code;
|
||||
status["failed"] = true;
|
||||
}
|
||||
models_json.push_back(json {
|
||||
{"id", meta.name},
|
||||
{"object", "model"}, // for OAI-compat
|
||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
||||
{"created", t}, // for OAI-compat
|
||||
{"in_cache", meta.in_cache},
|
||||
{"path", meta.path},
|
||||
{"status", status},
|
||||
// TODO: add other fields, may require reading GGUF metadata
|
||||
});
|
||||
}
|
||||
res_ok(res, {
|
||||
{"data", models_json},
|
||||
{"object", "list"},
|
||||
});
|
||||
return res;
|
||||
};
|
||||
|
||||
this->post_router_models_load = [this](const server_http_req & req) {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
json body = json::parse(req.body);
|
||||
|
|
@ -769,7 +863,7 @@ void server_models_routes::init_routes() {
|
|||
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
models.load(name, false);
|
||||
models.load(name);
|
||||
res_ok(res, {{"success", true}});
|
||||
return res;
|
||||
};
|
||||
|
|
@ -793,9 +887,12 @@ void server_models_routes::init_routes() {
|
|||
std::time_t t = std::time(0);
|
||||
for (const auto & meta : all_models) {
|
||||
json status {
|
||||
{"value", server_model_status_to_string(meta.status)},
|
||||
{"args", meta.args},
|
||||
{"value", server_model_status_to_string(meta.status)},
|
||||
{"args", meta.args},
|
||||
};
|
||||
if (!meta.preset.name.empty()) {
|
||||
status["preset"] = meta.preset.to_ini();
|
||||
}
|
||||
if (meta.is_failed()) {
|
||||
status["exit_code"] = meta.exit_code;
|
||||
status["failed"] = true;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "preset.h"
|
||||
#include "server-http.h"
|
||||
|
||||
#include <mutex>
|
||||
|
|
@ -47,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) {
|
|||
}
|
||||
|
||||
struct server_model_meta {
|
||||
common_preset preset;
|
||||
std::string name;
|
||||
std::string path;
|
||||
std::string path_mmproj; // only available if in_cache=false
|
||||
|
|
@ -54,7 +56,7 @@ struct server_model_meta {
|
|||
int port = 0;
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
|
||||
bool is_active() const {
|
||||
|
|
@ -66,6 +68,19 @@ struct server_model_meta {
|
|||
}
|
||||
};
|
||||
|
||||
// the server_presets struct holds the presets read from presets.ini
|
||||
// as well as base args from the router server
|
||||
struct server_presets {
|
||||
common_presets presets;
|
||||
common_params_context ctx_params;
|
||||
std::map<common_arg, std::string> base_args;
|
||||
std::map<std::string, common_arg> control_args; // args reserved for server control
|
||||
|
||||
server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
|
||||
common_preset get_preset(const std::string & name);
|
||||
void render_args(server_model_meta & meta);
|
||||
};
|
||||
|
||||
struct subprocess_s;
|
||||
|
||||
struct server_models {
|
||||
|
|
@ -85,14 +100,21 @@ private:
|
|||
std::vector<std::string> base_args;
|
||||
std::vector<std::string> base_env;
|
||||
|
||||
server_presets presets;
|
||||
|
||||
void update_meta(const std::string & name, const server_model_meta & meta);
|
||||
|
||||
// unload least recently used models if the limit is reached
|
||||
void unload_lru();
|
||||
|
||||
// not thread-safe, caller must hold mutex
|
||||
void add_model(server_model_meta && meta);
|
||||
|
||||
public:
|
||||
server_models(const common_params & params, int argc, char ** argv, char ** envp);
|
||||
|
||||
void load_models();
|
||||
|
||||
// check if a model instance exists
|
||||
bool has_model(const std::string & name);
|
||||
|
||||
|
|
@ -102,8 +124,7 @@ public:
|
|||
// return a copy of all model metadata
|
||||
std::vector<server_model_meta> get_all_meta();
|
||||
|
||||
// if auto_load is true, load the model with previous args if any
|
||||
void load(const std::string & name, bool auto_load);
|
||||
void load(const std::string & name);
|
||||
void unload(const std::string & name);
|
||||
void unload_all();
|
||||
|
||||
|
|
|
|||
36
tools/server/webui/package-lock.json
generated
36
tools/server/webui/package-lock.json
generated
|
|
@ -41,7 +41,7 @@
|
|||
"@tailwindcss/vite": "^4.0.0",
|
||||
"@types/node": "^22",
|
||||
"@vitest/browser": "^3.2.3",
|
||||
"bits-ui": "^2.8.11",
|
||||
"bits-ui": "^2.14.4",
|
||||
"clsx": "^2.1.1",
|
||||
"dexie": "^4.0.11",
|
||||
"eslint": "^9.18.0",
|
||||
|
|
@ -3343,17 +3343,17 @@
|
|||
}
|
||||
},
|
||||
"node_modules/bits-ui": {
|
||||
"version": "2.8.11",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
|
||||
"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
|
||||
"version": "2.14.4",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
|
||||
"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/core": "^1.7.1",
|
||||
"@floating-ui/dom": "^1.7.1",
|
||||
"esm-env": "^1.1.2",
|
||||
"runed": "^0.29.1",
|
||||
"svelte-toolbelt": "^0.9.3",
|
||||
"runed": "^0.35.1",
|
||||
"svelte-toolbelt": "^0.10.6",
|
||||
"tabbable": "^6.2.0"
|
||||
},
|
||||
"engines": {
|
||||
|
|
@ -3368,9 +3368,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/bits-ui/node_modules/runed": {
|
||||
"version": "0.29.2",
|
||||
"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
|
||||
"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
|
||||
"version": "0.35.1",
|
||||
"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
|
||||
"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/huntabyte",
|
||||
|
|
@ -3378,23 +3378,31 @@
|
|||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"esm-env": "^1.0.0"
|
||||
"dequal": "^2.0.3",
|
||||
"esm-env": "^1.0.0",
|
||||
"lz-string": "^1.5.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@sveltejs/kit": "^2.21.0",
|
||||
"svelte": "^5.7.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@sveltejs/kit": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bits-ui/node_modules/svelte-toolbelt": {
|
||||
"version": "0.9.3",
|
||||
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
|
||||
"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
|
||||
"version": "0.10.6",
|
||||
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
|
||||
"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/huntabyte"
|
||||
],
|
||||
"dependencies": {
|
||||
"clsx": "^2.1.1",
|
||||
"runed": "^0.29.0",
|
||||
"runed": "^0.35.1",
|
||||
"style-to-object": "^1.0.8"
|
||||
},
|
||||
"engines": {
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@
|
|||
"@tailwindcss/vite": "^4.0.0",
|
||||
"@types/node": "^22",
|
||||
"@vitest/browser": "^3.2.3",
|
||||
"bits-ui": "^2.8.11",
|
||||
"bits-ui": "^2.14.4",
|
||||
"clsx": "^2.1.1",
|
||||
"dexie": "^4.0.11",
|
||||
"eslint": "^9.18.0",
|
||||
|
|
|
|||
|
|
@ -331,6 +331,7 @@
|
|||
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
|
||||
? 'cursor-not-allowed opacity-60'
|
||||
: ''} {className}"
|
||||
data-slot="chat-form"
|
||||
>
|
||||
<ChatAttachmentsList
|
||||
bind:uploadedFiles
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
<script lang="ts">
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import { Search } from '@lucide/svelte';
|
||||
import { SearchInput } from '$lib/components/app';
|
||||
|
||||
interface Props {
|
||||
value?: string;
|
||||
|
|
@ -15,19 +14,6 @@
|
|||
onInput,
|
||||
class: className
|
||||
}: Props = $props();
|
||||
|
||||
function handleInput(event: Event) {
|
||||
const target = event.target as HTMLInputElement;
|
||||
|
||||
value = target.value;
|
||||
onInput?.(target.value);
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="relative mb-4 {className}">
|
||||
<Search
|
||||
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
|
||||
/>
|
||||
|
||||
<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
|
||||
</div>
|
||||
<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
|
|||
export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
|
||||
export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
|
||||
export { default as RemoveButton } from './misc/RemoveButton.svelte';
|
||||
export { default as SearchInput } from './misc/SearchInput.svelte';
|
||||
export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
|
||||
export { default as ModelsSelector } from './models/ModelsSelector.svelte';
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,73 @@
|
|||
<script lang="ts">
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import { Search, X } from '@lucide/svelte';
|
||||
|
||||
interface Props {
|
||||
value?: string;
|
||||
placeholder?: string;
|
||||
onInput?: (value: string) => void;
|
||||
onClose?: () => void;
|
||||
onKeyDown?: (event: KeyboardEvent) => void;
|
||||
class?: string;
|
||||
id?: string;
|
||||
ref?: HTMLInputElement | null;
|
||||
}
|
||||
|
||||
let {
|
||||
value = $bindable(''),
|
||||
placeholder = 'Search...',
|
||||
onInput,
|
||||
onClose,
|
||||
onKeyDown,
|
||||
class: className,
|
||||
id,
|
||||
ref = $bindable(null)
|
||||
}: Props = $props();
|
||||
|
||||
let showClearButton = $derived(!!value || !!onClose);
|
||||
|
||||
function handleInput(event: Event) {
|
||||
const target = event.target as HTMLInputElement;
|
||||
|
||||
value = target.value;
|
||||
onInput?.(target.value);
|
||||
}
|
||||
|
||||
function handleClear() {
|
||||
if (value) {
|
||||
value = '';
|
||||
onInput?.('');
|
||||
ref?.focus();
|
||||
} else {
|
||||
onClose?.();
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="relative {className}">
|
||||
<Search
|
||||
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
|
||||
/>
|
||||
|
||||
<Input
|
||||
{id}
|
||||
bind:value
|
||||
bind:ref
|
||||
class="pl-9 {showClearButton ? 'pr-9' : ''}"
|
||||
oninput={handleInput}
|
||||
onkeydown={onKeyDown}
|
||||
{placeholder}
|
||||
type="search"
|
||||
/>
|
||||
|
||||
{#if showClearButton}
|
||||
<button
|
||||
type="button"
|
||||
class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
|
||||
onclick={handleClear}
|
||||
aria-label={value ? 'Clear search' : 'Close'}
|
||||
>
|
||||
<X class="h-4 w-4" />
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
|
|
@ -2,8 +2,8 @@
|
|||
import { onMount, tick } from 'svelte';
|
||||
import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
import * as Popover from '$lib/components/ui/popover';
|
||||
import { cn } from '$lib/components/ui/utils';
|
||||
import { portalToBody } from '$lib/utils';
|
||||
import {
|
||||
modelsStore,
|
||||
modelOptions,
|
||||
|
|
@ -17,12 +17,8 @@
|
|||
import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import { ServerModelStatus } from '$lib/enums';
|
||||
import { isRouterMode } from '$lib/stores/server.svelte';
|
||||
import { DialogModelInformation } from '$lib/components/app';
|
||||
import {
|
||||
MENU_MAX_WIDTH,
|
||||
MENU_OFFSET,
|
||||
VIEWPORT_GUTTER
|
||||
} from '$lib/constants/floating-ui-constraints';
|
||||
import { DialogModelInformation, SearchInput } from '$lib/components/app';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
|
|
@ -145,185 +141,126 @@
|
|||
return options.some((option) => option.model === currentModel);
|
||||
});
|
||||
|
||||
let isOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
let container: HTMLDivElement | null = null;
|
||||
let menuRef = $state<HTMLDivElement | null>(null);
|
||||
let triggerButton = $state<HTMLButtonElement | null>(null);
|
||||
let menuPosition = $state<{
|
||||
top: number;
|
||||
left: number;
|
||||
width: number;
|
||||
placement: 'top' | 'bottom';
|
||||
maxHeight: number;
|
||||
} | null>(null);
|
||||
let searchTerm = $state('');
|
||||
let searchInputRef = $state<HTMLInputElement | null>(null);
|
||||
let highlightedIndex = $state<number>(-1);
|
||||
|
||||
onMount(async () => {
|
||||
try {
|
||||
await modelsStore.fetch();
|
||||
} catch (error) {
|
||||
console.error('Unable to load models:', error);
|
||||
}
|
||||
let filteredOptions: ModelOption[] = $derived(
|
||||
(() => {
|
||||
const term = searchTerm.trim().toLowerCase();
|
||||
if (!term) return options;
|
||||
|
||||
return options.filter(
|
||||
(option) =>
|
||||
option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
|
||||
);
|
||||
})()
|
||||
);
|
||||
|
||||
// Get indices of compatible options for keyboard navigation
|
||||
let compatibleIndices = $derived(
|
||||
filteredOptions
|
||||
.map((option, index) => (isModelCompatible(option) ? index : -1))
|
||||
.filter((i) => i !== -1)
|
||||
);
|
||||
|
||||
// Reset highlighted index when search term changes
|
||||
$effect(() => {
|
||||
void searchTerm;
|
||||
highlightedIndex = -1;
|
||||
});
|
||||
|
||||
function toggleOpen() {
|
||||
let isOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
|
||||
onMount(() => {
|
||||
modelsStore.fetch().catch((error) => {
|
||||
console.error('Unable to load models:', error);
|
||||
});
|
||||
});
|
||||
|
||||
function handleOpenChange(open: boolean) {
|
||||
if (loading || updating) return;
|
||||
|
||||
if (isRouter) {
|
||||
// Router mode: show dropdown
|
||||
if (isOpen) {
|
||||
closeMenu();
|
||||
} else {
|
||||
openMenu();
|
||||
if (open) {
|
||||
isOpen = true;
|
||||
searchTerm = '';
|
||||
highlightedIndex = -1;
|
||||
|
||||
// Focus search input after popover opens
|
||||
tick().then(() => {
|
||||
requestAnimationFrame(() => searchInputRef?.focus());
|
||||
});
|
||||
|
||||
if (isRouter) {
|
||||
modelsStore.fetchRouterModels().then(() => {
|
||||
modelsStore.fetchModalitiesForLoadedModels();
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Single model mode: show dialog
|
||||
showModelDialog = true;
|
||||
isOpen = false;
|
||||
searchTerm = '';
|
||||
highlightedIndex = -1;
|
||||
}
|
||||
}
|
||||
|
||||
async function openMenu() {
|
||||
function handleTriggerClick() {
|
||||
if (loading || updating) return;
|
||||
|
||||
isOpen = true;
|
||||
await tick();
|
||||
updateMenuPosition();
|
||||
requestAnimationFrame(() => updateMenuPosition());
|
||||
|
||||
if (isRouter) {
|
||||
modelsStore.fetchRouterModels().then(() => {
|
||||
modelsStore.fetchModalitiesForLoadedModels();
|
||||
});
|
||||
if (!isRouter) {
|
||||
// Single model mode: show dialog instead of popover
|
||||
showModelDialog = true;
|
||||
}
|
||||
// For router mode, the Popover handles open/close
|
||||
}
|
||||
|
||||
export function open() {
|
||||
if (isRouter) {
|
||||
openMenu();
|
||||
handleOpenChange(true);
|
||||
} else {
|
||||
showModelDialog = true;
|
||||
}
|
||||
}
|
||||
|
||||
function closeMenu() {
|
||||
if (!isOpen) return;
|
||||
|
||||
isOpen = false;
|
||||
menuPosition = null;
|
||||
handleOpenChange(false);
|
||||
}
|
||||
|
||||
function handlePointerDown(event: PointerEvent) {
|
||||
if (!container) return;
|
||||
function handleSearchKeyDown(event: KeyboardEvent) {
|
||||
if (event.isComposing) return;
|
||||
|
||||
const target = event.target as Node | null;
|
||||
if (event.key === 'ArrowDown') {
|
||||
event.preventDefault();
|
||||
if (compatibleIndices.length === 0) return;
|
||||
|
||||
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
|
||||
closeMenu();
|
||||
}
|
||||
}
|
||||
|
||||
function handleKeydown(event: KeyboardEvent) {
|
||||
if (event.key === 'Escape') {
|
||||
closeMenu();
|
||||
}
|
||||
}
|
||||
|
||||
function handleResize() {
|
||||
if (isOpen) {
|
||||
updateMenuPosition();
|
||||
}
|
||||
}
|
||||
|
||||
function updateMenuPosition() {
|
||||
if (!isOpen || !triggerButton || !menuRef) return;
|
||||
|
||||
const triggerRect = triggerButton.getBoundingClientRect();
|
||||
const viewportWidth = window.innerWidth;
|
||||
const viewportHeight = window.innerHeight;
|
||||
|
||||
if (viewportWidth === 0 || viewportHeight === 0) return;
|
||||
|
||||
const scrollWidth = menuRef.scrollWidth;
|
||||
const scrollHeight = menuRef.scrollHeight;
|
||||
|
||||
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
|
||||
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
|
||||
const safeMaxWidth =
|
||||
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
|
||||
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
|
||||
|
||||
let width = Math.min(
|
||||
Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
|
||||
safeMaxWidth || 320
|
||||
);
|
||||
|
||||
const availableBelow = Math.max(
|
||||
0,
|
||||
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
|
||||
);
|
||||
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
|
||||
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
|
||||
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
|
||||
|
||||
function computePlacement(placement: 'top' | 'bottom') {
|
||||
const available = placement === 'bottom' ? availableBelow : availableAbove;
|
||||
const allowedHeight =
|
||||
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
|
||||
const maxHeight = Math.min(scrollHeight, allowedHeight);
|
||||
const height = Math.max(0, maxHeight);
|
||||
|
||||
let top: number;
|
||||
if (placement === 'bottom') {
|
||||
const rawTop = triggerRect.bottom + MENU_OFFSET;
|
||||
const minTop = VIEWPORT_GUTTER;
|
||||
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
|
||||
if (maxTop < minTop) {
|
||||
top = minTop;
|
||||
} else {
|
||||
top = Math.min(Math.max(rawTop, minTop), maxTop);
|
||||
}
|
||||
const currentPos = compatibleIndices.indexOf(highlightedIndex);
|
||||
if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
|
||||
highlightedIndex = compatibleIndices[0];
|
||||
} else {
|
||||
const rawTop = triggerRect.top - MENU_OFFSET - height;
|
||||
const minTop = VIEWPORT_GUTTER;
|
||||
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
|
||||
if (maxTop < minTop) {
|
||||
top = minTop;
|
||||
} else {
|
||||
top = Math.max(Math.min(rawTop, maxTop), minTop);
|
||||
highlightedIndex = compatibleIndices[currentPos + 1];
|
||||
}
|
||||
} else if (event.key === 'ArrowUp') {
|
||||
event.preventDefault();
|
||||
if (compatibleIndices.length === 0) return;
|
||||
|
||||
const currentPos = compatibleIndices.indexOf(highlightedIndex);
|
||||
if (currentPos === -1 || currentPos === 0) {
|
||||
highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
|
||||
} else {
|
||||
highlightedIndex = compatibleIndices[currentPos - 1];
|
||||
}
|
||||
} else if (event.key === 'Enter') {
|
||||
event.preventDefault();
|
||||
if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
|
||||
const option = filteredOptions[highlightedIndex];
|
||||
if (isModelCompatible(option)) {
|
||||
handleSelect(option.id);
|
||||
}
|
||||
}
|
||||
|
||||
return { placement, top, height, maxHeight };
|
||||
}
|
||||
|
||||
const belowMetrics = computePlacement('bottom');
|
||||
const aboveMetrics = computePlacement('top');
|
||||
|
||||
let metrics = belowMetrics;
|
||||
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
|
||||
metrics = aboveMetrics;
|
||||
}
|
||||
|
||||
let left = triggerRect.right - width;
|
||||
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
|
||||
if (maxLeft < VIEWPORT_GUTTER) {
|
||||
left = VIEWPORT_GUTTER;
|
||||
} else {
|
||||
if (left > maxLeft) {
|
||||
left = maxLeft;
|
||||
}
|
||||
if (left < VIEWPORT_GUTTER) {
|
||||
left = VIEWPORT_GUTTER;
|
||||
} else if (compatibleIndices.length > 0) {
|
||||
// No selection - highlight first compatible option
|
||||
highlightedIndex = compatibleIndices[0];
|
||||
}
|
||||
}
|
||||
|
||||
menuPosition = {
|
||||
top: Math.round(metrics.top),
|
||||
left: Math.round(left),
|
||||
width: Math.round(width),
|
||||
placement: metrics.placement,
|
||||
maxHeight: Math.round(metrics.maxHeight)
|
||||
};
|
||||
}
|
||||
|
||||
async function handleSelect(modelId: string) {
|
||||
|
|
@ -356,6 +293,14 @@
|
|||
|
||||
if (shouldCloseMenu) {
|
||||
closeMenu();
|
||||
|
||||
// Focus the chat textarea after model selection
|
||||
requestAnimationFrame(() => {
|
||||
const textarea = document.querySelector<HTMLTextAreaElement>(
|
||||
'[data-slot="chat-form"] textarea'
|
||||
);
|
||||
textarea?.focus();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -404,10 +349,7 @@
|
|||
}
|
||||
</script>
|
||||
|
||||
<svelte:window onresize={handleResize} />
|
||||
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
|
||||
|
||||
<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
|
||||
<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
|
||||
{#if loading && options.length === 0 && isRouter}
|
||||
<div class="flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<Loader2 class="h-3.5 w-3.5 animate-spin" />
|
||||
|
|
@ -418,9 +360,8 @@
|
|||
{:else}
|
||||
{@const selectedOption = getDisplayOption()}
|
||||
|
||||
<div class="relative">
|
||||
<button
|
||||
type="button"
|
||||
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
|
||||
<Popover.Trigger
|
||||
class={cn(
|
||||
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
|
||||
!isCurrentModelInCache()
|
||||
|
|
@ -430,15 +371,11 @@
|
|||
: isHighlightedCurrentModelActive
|
||||
? 'text-foreground'
|
||||
: 'text-muted-foreground',
|
||||
isOpen ? 'text-foreground' : '',
|
||||
className
|
||||
isOpen ? 'text-foreground' : ''
|
||||
)}
|
||||
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
|
||||
aria-haspopup={isRouter ? 'listbox' : undefined}
|
||||
aria-expanded={isRouter ? isOpen : undefined}
|
||||
onclick={toggleOpen}
|
||||
bind:this={triggerButton}
|
||||
disabled={disabled || updating}
|
||||
onclick={handleTriggerClick}
|
||||
disabled={disabled || updating || !isRouter}
|
||||
>
|
||||
<Package class="h-3.5 w-3.5" />
|
||||
|
||||
|
|
@ -451,33 +388,35 @@
|
|||
{:else if isRouter}
|
||||
<ChevronDown class="h-3 w-3.5" />
|
||||
{/if}
|
||||
</button>
|
||||
</Popover.Trigger>
|
||||
|
||||
{#if isOpen && isRouter}
|
||||
<div
|
||||
bind:this={menuRef}
|
||||
use:portalToBody
|
||||
class={cn(
|
||||
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
|
||||
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
|
||||
)}
|
||||
role="listbox"
|
||||
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
|
||||
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
|
||||
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
|
||||
data-placement={menuPosition?.placement ?? 'bottom'}
|
||||
>
|
||||
<Popover.Content
|
||||
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
|
||||
align="end"
|
||||
sideOffset={8}
|
||||
collisionPadding={16}
|
||||
>
|
||||
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
|
||||
<div
|
||||
class="overflow-y-auto py-1"
|
||||
style:max-height={menuPosition && menuPosition.maxHeight > 0
|
||||
? `${menuPosition.maxHeight}px`
|
||||
: undefined}
|
||||
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
|
||||
>
|
||||
<SearchInput
|
||||
id="model-search"
|
||||
placeholder="Search models..."
|
||||
bind:value={searchTerm}
|
||||
bind:ref={searchInputRef}
|
||||
onClose={closeMenu}
|
||||
onKeyDown={handleSearchKeyDown}
|
||||
/>
|
||||
</div>
|
||||
<div
|
||||
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
|
||||
>
|
||||
{#if !isCurrentModelInCache() && currentModel}
|
||||
<!-- Show unavailable model as first option (disabled) -->
|
||||
<button
|
||||
type="button"
|
||||
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
|
||||
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
|
||||
role="option"
|
||||
aria-selected="true"
|
||||
aria-disabled="true"
|
||||
|
|
@ -488,20 +427,25 @@
|
|||
</button>
|
||||
<div class="my-1 h-px bg-border"></div>
|
||||
{/if}
|
||||
{#each options as option (option.id)}
|
||||
{#if filteredOptions.length === 0}
|
||||
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
|
||||
{/if}
|
||||
{#each filteredOptions as option, index (option.id)}
|
||||
{@const status = getModelStatus(option.model)}
|
||||
{@const isLoaded = status === ServerModelStatus.LOADED}
|
||||
{@const isLoading = status === ServerModelStatus.LOADING}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isCompatible = isModelCompatible(option)}
|
||||
{@const isHighlighted = index === highlightedIndex}
|
||||
{@const missingModalities = getMissingModalities(option)}
|
||||
|
||||
<div
|
||||
class={cn(
|
||||
'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
|
||||
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
|
||||
isCompatible
|
||||
? 'cursor-pointer hover:bg-muted focus:bg-muted'
|
||||
: 'cursor-not-allowed opacity-50',
|
||||
isSelected
|
||||
isSelected || isHighlighted
|
||||
? 'bg-accent text-accent-foreground'
|
||||
: isCompatible
|
||||
? 'hover:bg-accent hover:text-accent-foreground'
|
||||
|
|
@ -509,10 +453,11 @@
|
|||
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
|
||||
)}
|
||||
role="option"
|
||||
aria-selected={isSelected}
|
||||
aria-selected={isSelected || isHighlighted}
|
||||
aria-disabled={!isCompatible}
|
||||
tabindex={isCompatible ? 0 : -1}
|
||||
onclick={() => isCompatible && handleSelect(option.id)}
|
||||
onmouseenter={() => (highlightedIndex = index)}
|
||||
onkeydown={(e) => {
|
||||
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
|
||||
e.preventDefault();
|
||||
|
|
@ -586,8 +531,8 @@
|
|||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
</Popover.Content>
|
||||
</Popover.Root>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
|
|
|
|||
19
tools/server/webui/src/lib/components/ui/popover/index.ts
Normal file
19
tools/server/webui/src/lib/components/ui/popover/index.ts
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
import Root from './popover.svelte';
|
||||
import Close from './popover-close.svelte';
|
||||
import Content from './popover-content.svelte';
|
||||
import Trigger from './popover-trigger.svelte';
|
||||
import Portal from './popover-portal.svelte';
|
||||
|
||||
export {
|
||||
Root,
|
||||
Content,
|
||||
Trigger,
|
||||
Close,
|
||||
Portal,
|
||||
//
|
||||
Root as Popover,
|
||||
Content as PopoverContent,
|
||||
Trigger as PopoverTrigger,
|
||||
Close as PopoverClose,
|
||||
Portal as PopoverPortal
|
||||
};
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
import PopoverPortal from './popover-portal.svelte';
|
||||
import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
|
||||
import type { ComponentProps } from 'svelte';
|
||||
|
||||
let {
|
||||
ref = $bindable(null),
|
||||
class: className,
|
||||
sideOffset = 4,
|
||||
side,
|
||||
align = 'center',
|
||||
collisionPadding = 8,
|
||||
avoidCollisions = true,
|
||||
portalProps,
|
||||
...restProps
|
||||
}: PopoverPrimitive.ContentProps & {
|
||||
portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
|
||||
} = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPortal {...portalProps}>
|
||||
<PopoverPrimitive.Content
|
||||
bind:ref
|
||||
data-slot="popover-content"
|
||||
{sideOffset}
|
||||
{side}
|
||||
{align}
|
||||
{collisionPadding}
|
||||
{avoidCollisions}
|
||||
class={cn(
|
||||
'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
|
||||
className
|
||||
)}
|
||||
{...restProps}
|
||||
/>
|
||||
</PopoverPortal>
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { ...restProps }: PopoverPrimitive.PortalProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Portal {...restProps} />
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
<script lang="ts">
|
||||
import { cn } from '$lib/components/ui/utils.js';
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let {
|
||||
ref = $bindable(null),
|
||||
class: className,
|
||||
...restProps
|
||||
}: PopoverPrimitive.TriggerProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Trigger
|
||||
bind:ref
|
||||
data-slot="popover-trigger"
|
||||
class={cn('', className)}
|
||||
{...restProps}
|
||||
/>
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Root bind:open {...restProps} />
|
||||
|
|
@ -1,3 +1,2 @@
|
|||
export const VIEWPORT_GUTTER = 8;
|
||||
export const MENU_OFFSET = 6;
|
||||
export const MENU_MAX_WIDTH = 320;
|
||||
|
|
|
|||
|
|
@ -295,14 +295,21 @@ class ModelsStore {
|
|||
* Fetch props for a specific model from /props endpoint
|
||||
* Uses caching to avoid redundant requests
|
||||
*
|
||||
* In ROUTER mode, this will only fetch props if the model is loaded,
|
||||
* since unloaded models return 400 from /props endpoint.
|
||||
*
|
||||
* @param modelId - Model identifier to fetch props for
|
||||
* @returns Props data or null if fetch failed
|
||||
* @returns Props data or null if fetch failed or model not loaded
|
||||
*/
|
||||
async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
|
||||
// Return cached props if available
|
||||
const cached = this.modelPropsCache.get(modelId);
|
||||
if (cached) return cached;
|
||||
|
||||
if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Avoid duplicate fetches
|
||||
if (this.modelPropsFetching.has(modelId)) return null;
|
||||
|
||||
|
|
|
|||
|
|
@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases}
|
|||
expect(output).toBe(input); // Code blocks prevent misinterpretation
|
||||
});
|
||||
|
||||
test('preserves backslash parentheses in code blocks (GitHub issue)', () => {
|
||||
const input = '```python\nfoo = "\\(bar\\)"\n```';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
|
||||
});
|
||||
|
||||
test('preserves backslash brackets in code blocks', () => {
|
||||
const input = '```python\nfoo = "\\[bar\\]"\n```';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
|
||||
});
|
||||
|
||||
test('preserves backslash parentheses in inline code', () => {
|
||||
const input = 'Use `foo = "\\(bar\\)"` in your code.';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input);
|
||||
});
|
||||
|
||||
test('escape backslash in mchem ce', () => {
|
||||
const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
|
|
|||
|
|
@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string {
|
|||
return expr;
|
||||
});
|
||||
|
||||
// Step 5: Restore code blocks
|
||||
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
|
||||
return codeBlocks[parseInt(index)];
|
||||
});
|
||||
|
||||
// Step 6: Apply additional escaping functions (brackets and mhchem)
|
||||
// Step 5: Apply additional escaping functions (brackets and mhchem)
|
||||
// This must happen BEFORE restoring code blocks to avoid affecting code content
|
||||
content = escapeBrackets(content);
|
||||
|
||||
if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
|
||||
content = escapeMhchem(content);
|
||||
}
|
||||
|
||||
// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
|
||||
// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
|
||||
// This must happen BEFORE restoring code blocks to avoid affecting code content
|
||||
content = content
|
||||
// Using the look‑behind pattern `(?<!\\)` we skip matches
|
||||
// that are preceded by a backslash, e.g.
|
||||
|
|
@ -248,12 +245,18 @@ export function preprocessLaTeX(content: string): string {
|
|||
// Using the look‑behind pattern `(?<!\\)` we skip matches
|
||||
// that are preceded by a backslash, e.g. `\\[4pt]`.
|
||||
/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
|
||||
(_, prefix: string, content: string) => {
|
||||
return `${prefix}$$${content}$$`;
|
||||
(_, content: string) => {
|
||||
return `$$${content}$$`;
|
||||
}
|
||||
);
|
||||
|
||||
// Step 7: Restore blockquote markers
|
||||
// Step 7: Restore code blocks
|
||||
// This happens AFTER all LaTeX conversions to preserve code content
|
||||
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
|
||||
return codeBlocks[parseInt(index)];
|
||||
});
|
||||
|
||||
// Step 8: Restore blockquote markers
|
||||
if (blockquoteMarkers.size > 0) {
|
||||
const finalLines = content.split('\n');
|
||||
const restoredLines = finalLines.map((line, index) => {
|
||||
|
|
|
|||
4
vendor/cpp-httplib/CMakeLists.txt
vendored
4
vendor/cpp-httplib/CMakeLists.txt
vendored
|
|
@ -9,6 +9,10 @@ if (NOT MSVC)
|
|||
endif()
|
||||
|
||||
target_link_libraries (${TARGET} PRIVATE Threads::Threads)
|
||||
|
||||
if (WIN32 AND NOT MSVC)
|
||||
target_link_libraries(${TARGET} PUBLIC ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue