diff --git a/common/arg.cpp b/common/arg.cpp
index 05f41036e..52a254a35 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -49,6 +49,7 @@
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 using json = nlohmann::ordered_json;
+using namespace common_arg_utils;
 
 static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
@@ -66,6 +67,15 @@ static std::string read_file(const std::string & fname) {
     return content;
 }
 
+static const std::vector<common_arg> & get_common_arg_defs() { // server option definitions, built on first call and cached
+    static const std::vector<common_arg> options = [] {
+        common_params params; // throwaway: only the option list is kept
+        auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+        return ctx.options; // copied out, since ctx does not outlive this lambda
+    }();
+    return options;
+}
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = examples;
     return *this;
@@ -136,7 +146,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
     return result;
 }
 
-std::string common_arg::to_string() {
+std::string common_arg::to_string() const {
     // params for printing to console
     const static int n_leading_spaces = 40;
     const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -649,6 +659,53 @@ static void add_rpc_devices(const std::string & servers) {
     }
 }
 
+// parse CLI arguments into a map of option -> raw string value (flags map to ""); no handler is invoked
+// throws std::invalid_argument on an unknown argument, a missing value or an option taking 2 values
+bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+    common_params dummy_params;
+    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+    // index every spelling of every option
+    std::unordered_map<std::string, const common_arg *> arg_to_options;
+    for (const auto & opt : ctx_arg.options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    // TODO @ngxson : find a way to deduplicate this code
+
+    const std::string arg_prefix = "--";
+    // handle command line arguments
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        // single lookup; the option is borrowed from ctx_arg and only copied when stored as a map key
+        const auto it = arg_to_options.find(arg);
+        if (it == arg_to_options.end()) {
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        }
+        const common_arg & opt = *it->second;
+        std::string val;
+        if (opt.value_hint != nullptr) {
+            // arg with single value
+            if (i+1 >= argc) {
+                throw std::invalid_argument("expected value for argument");
+            }
+            val = argv[++i];
+        }
+        if (opt.value_hint_2 != nullptr) {
+            // TODO: support arg with 2 values
+            throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+        }
+        out_map[opt] = val;
+    }
+
+    return true;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -694,25 +751,19 @@ static std::string list_builtin_chat_templates() {
     return msg.str();
 }
 
-static bool is_truthy(const std::string & value) {
+bool common_arg_utils::is_truthy(const std::string & value) { // exact match on "on", "enabled" or "1"
     return value == "on" || value == "enabled" || value == "1";
 }
 
-static bool is_falsey(const std::string & value) {
+bool common_arg_utils::is_falsey(const std::string & value) { // exact match on "off", "disabled" or "0"
     return value == "off" || value == "disabled" || value == "0";
 }
 
-static bool is_autoy(const std::string & value) {
+bool common_arg_utils::is_autoy(const std::string & value) { // exact match on "auto" or "-1"
     return value == "auto" || value == "-1";
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-    // default values specific to example
-    // note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
-    if (ex == LLAMA_EXAMPLE_SERVER) {
-        params.use_jinja = true;
-    }
-
     params.use_color = tty_can_use_colors();
 
     // load dynamic backends
@@ -1807,7 +1858,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
-        {"--mmproj"}, "FILE",
+        {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"
         "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
@@ -1815,7 +1866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
-        {"--mmproj-url"}, "URL",
+        {"-mmu", "--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
@@ -2545,6 +2596,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_dir = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+    add_opt(common_arg(
+        {"--models-preset"}, "PATH",
+        "path to INI file containing model presets for the router server (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.models_preset = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
     add_opt(common_arg(
         {"--models-max"}, "N",
         string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@@ -2561,14 +2619,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
         [](common_params & params) {
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
         [](common_params & params) {
             params.use_jinja = false;
         }
diff --git a/common/arg.h b/common/arg.h
index 7ab7e2cea..219c115e6 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,8 +3,10 @@
 #include "common.h"
 
 #include <set>
+#include <map>
 #include <string>
 #include <vector>
+#include <cstring>
 
 //
 // CLI argument parsing
@@ -24,6 +26,8 @@ struct common_arg {
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int)    (common_params & params, int) = nullptr;
 
+    common_arg() = default;
+
     common_arg(
         const std::initializer_list<const char *> & args,
         const char * value_hint,
@@ -61,9 +65,29 @@ struct common_arg {
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
     bool has_value_from_env() const;
-    std::string to_string();
+    std::string to_string() const;
+
+    // for using as key in std::map; an arg-less option sorts first so that operator< stays a strict weak ordering
+    bool operator<(const common_arg& other) const {
+        if (args.empty() || other.args.empty()) {
+            return args.empty() && !other.args.empty();
+        }
+        return strcmp(args[0], other.args[0]) < 0;
+    }
+    bool operator==(const common_arg& other) const {
+        if (args.empty() || other.args.empty()) {
+            return args.empty() && other.args.empty();
+        }
+        return strcmp(args[0], other.args[0]) == 0;
+    }
 };
 
+namespace common_arg_utils {
+    bool is_truthy(const std::string & value); // value is exactly "on", "enabled" or "1"
+    bool is_falsey(const std::string & value); // value is exactly "off", "disabled" or "0"
+    bool is_autoy(const std::string & value);  // value is exactly "auto" or "-1"
+}
+
 struct common_params_context {
     enum llama_example ex = LLAMA_EXAMPLE_COMMON;
     common_params & params;
@@ -76,7 +100,11 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
-// function to be used by test-arg-parser
+// parse input arguments from CLI into a map
+// TODO: support repeated args in the future
+bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+
+// initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 struct common_remote_params {
diff --git a/common/common.h b/common/common.h
index 3f99817de..c2c48a2b0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -460,7 +460,7 @@ struct common_params {
     std::string public_path   = "";                                                                         // NOLINT
     std::string api_prefix    = "";                                                                         // NOLINT
     std::string chat_template = "";                                                                         // NOLINT
-    bool use_jinja = false;                                                                                 // NOLINT
+    bool use_jinja = true;                                                                                  // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -480,9 +480,10 @@ struct common_params {
     bool endpoint_metrics = false;
 
     // router server configs
-    std::string models_dir = ""; // directory containing models for the router server
-    int models_max = 4;          // maximum number of models to load simultaneously
-    bool models_autoload = true; // automatically load models when requested via the router server
+    std::string models_dir    = ""; // directory containing models for the router server
+    std::string models_preset = ""; // path to INI file containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server
 
     bool log_json = false;
 
diff --git a/common/download.cpp b/common/download.cpp
index ab68c53b4..ef8747256 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #elif defined(LLAMA_USE_HTTPLIB)
 
-static bool is_output_a_tty() {
+class ProgressBar { // draws one download progress line per instance; all instances share the static state below
+    static inline std::mutex mutex; // guards lines, max_line and the drawing done in update()
+    static inline std::map<const ProgressBar *, int> lines; // bar -> index of the terminal line assigned to it
+    static inline int max_line = 0; // next line index to hand out
+
+    static void cleanup(const ProgressBar * line) { // both call sites hold mutex
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
-    return _isatty(_fileno(stdout));
+        return _isatty(_fileno(stdout));
 #else
-    return isatty(1);
+        return isatty(1);
 #endif
-}
-
-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
     }
 
-    if (!total) {
-        return;
+public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
     }
 
-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) { // draw nothing when stdout is not a terminal
+            return;
+        }
 
-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "%  ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+        if (!total) { // also avoids dividing by zero below
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++; // first update of this bar: reserve a fresh line
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s"; // ANSI: save cursor position
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A"; // ANSI: move cursor up to this bar's line
+        }
+        std::cout << "\033[2K\r[" // ANSI: erase the whole line, then redraw from column 0
+            << std::string(pos, '=')
+            << (pos < width ? ">" : "")
+            << std::string(width - pos, ' ') // NOTE(review): size_t wraps if current > total - confirm callers never overshoot
+            << "] " << std::setw(3) << pct << "%  ("
+            << current / (1024 * 1024) << " MB / "
+            << total / (1024 * 1024) << " MB) "
+            << "\033[u"; // ANSI: restore cursor position
+
+        std::cout.flush();
+
+        if (current == total) { // finished: release this bar's line
+             cleanup(this);
+        }
+    }
+
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};
 
 static bool common_pull_file(httplib::Client & cli,
                              const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
     const char * func = __func__; // avoid __func__ inside a lambda
     size_t downloaded = existing_size;
     size_t progress_step = 0;
+    ProgressBar bar;
 
     auto res = cli.Get(resolve_path, headers,
         [&](const httplib::Response &response) {
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
             progress_step += len;
 
             if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                 progress_step = 0;
             }
             return true;
@@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
         nullptr
     );
 
-    std::cout << "\n";
-
     if (!res) {
         LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
         return false;
diff --git a/common/preset.cpp b/common/preset.cpp
new file mode 100644
index 000000000..09ac171b7
--- /dev/null
+++ b/common/preset.cpp
@@ -0,0 +1,180 @@
+#include "arg.h"
+#include "preset.h"
+#include "peg-parser.h"
+#include "log.h"
+
+#include <fstream>
+#include <sstream>
+#include <filesystem>
+
+// strip every leading '-' from str, e.g. "--models-dir" -> "models-dir"
+static std::string rm_leading_dashes(const std::string & str) {
+    const size_t first_kept = str.find_first_not_of('-');
+    // npos: the string is empty or consists of dashes only
+    if (first_kept == std::string::npos) { return ""; }
+    return str.substr(first_kept);
+}
+
+// convert the preset to a CLI argument list: each option's name (its last alias) followed by its value, if any
+std::vector<std::string> common_preset::to_args() const {
+    std::vector<std::string> args;
+
+    for (const auto & [opt, value] : options) {
+        // a default-constructed option has no name to emit, and back() on an empty vector is UB
+        if (opt.args.empty()) { continue; }
+        const char * name = opt.args.back(); // use the last arg as the main arg
+        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
+            throw std::runtime_error(string_format(
+                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
+                name
+            ));
+        }
+        if (opt.value_hint != nullptr) {
+            // single value
+            args.push_back(name);
+            args.push_back(value);
+        } else if (!common_arg_utils::is_falsey(value)) {
+            // flag option, no value; a falsey value means the flag is skipped
+            args.push_back(name);
+        }
+    }
+
+    return args;
+}
+
+// serialize the preset as an INI section: "[name]" followed by one "key = value" line per option
+std::string common_preset::to_ini() const {
+    std::ostringstream ss;
+    ss << "[" << name << "]\n";
+    for (const auto & [opt, value] : options) {
+        if (opt.args.empty()) { continue; } // nameless (default-constructed) option: back() would be UB
+        // newline -> backslash + newline; NOTE(review): the INI grammar in this file ends a value at the first newline - confirm round-trip
+        auto escaped_value = value;
+        string_replace_all(escaped_value, "\n", "\\\n");
+        ss << rm_leading_dashes(opt.args.back()) << " = " << escaped_value << "\n";
+    }
+    ss << "\n";
+    return ss.str();
+}
+
+// read the INI file at `path` into section -> (key -> value); throws std::runtime_error if it is missing, unreadable or unparsable
+static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
+    std::map<std::string, std::map<std::string, std::string>> parsed;
+    if (!std::filesystem::exists(path)) {
+        throw std::runtime_error("preset file does not exist: " + path);
+    }
+
+    std::ifstream file(path);
+    if (!file.good()) {
+        throw std::runtime_error("failed to open server preset file: " + path);
+    }
+
+    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+    static const auto parser = build_peg_parser([](auto & p) {
+        // newline ::= "\r\n" / "\n" / "\r"
+        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
+
+        // ws ::= [ \t]*
+        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
+
+        // comment ::= [;#] (!newline .)*
+        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
+
+        // eol ::= ws comment? (newline / EOF)
+        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
+
+        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
+        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
+
+        // value ::= (!eol-start .)*
+        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
+        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
+
+        // header-line ::= "[" ws section-name ws "]" eol, section-name being non-']' chars (count per p.chars defaults - TODO confirm)
+        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
+
+        // kv-line ::= ident ws "=" ws value eol
+        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
+
+        // comment-line ::= ws comment (newline / EOF)
+        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
+
+        // blank-line ::= ws (newline / EOF)
+        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
+
+        // line ::= header-line / kv-line / comment-line / blank-line
+        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
+
+        // ini ::= line* EOF
+        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
+
+        return ini;
+    });
+
+    common_peg_parse_context ctx(contents);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        throw std::runtime_error("failed to parse server config file: " + path);
+    }
+
+    std::string current_section = COMMON_PRESET_DEFAULT_NAME; // keys seen before any header land in this section
+    std::string current_key;
+
+    ctx.ast.visit(result, [&](const auto & node) {
+        if (node.tag == "section-name") {
+            const std::string section = std::string(node.text);
+            current_section = section;
+            parsed[current_section] = {}; // note: a repeated header resets the entries collected for that section so far
+        } else if (node.tag == "key") {
+            const std::string key = std::string(node.text);
+            current_key = key;
+        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
+            parsed[current_section][current_key] = std::string(node.text);
+            current_key.clear(); // a value is only stored together with the key tag seen just before it
+        }
+    });
+
+    return parsed;
+}
+
+// map every accepted key spelling to its option: the env variable name and
+// each CLI alias with its leading dashes removed (a later entry replaces an earlier one)
+static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
+    std::map<std::string, common_arg> by_key;
+    for (const auto & option : ctx_params.options) {
+        if (option.env != nullptr) { by_key.insert_or_assign(std::string(option.env), option); }
+        for (const auto & alias : option.args) {
+            by_key.insert_or_assign(rm_leading_dashes(alias), option);
+        }
+    }
+    return by_key;
+}
+
+// load every preset of the INI file at `path`; keys that match no option of ctx_params are ignored
+// a key may be an option's env variable name or any of its CLI aliases without the leading dashes
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+    common_presets out;
+    const auto key_to_opt = get_map_key_opt(ctx_params);
+    const auto ini_data = parse_ini_from_file(path);
+
+    // iterate by reference: each section owns a whole key/value map
+    for (const auto & [section_name, entries] : ini_data) {
+        common_preset preset;
+        preset.name = section_name.empty() ? COMMON_PRESET_DEFAULT_NAME : section_name;
+        LOG_DBG("loading preset: %s\n", preset.name.c_str());
+        for (const auto & [key, value] : entries) {
+            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            const auto it = key_to_opt.find(key); // single lookup instead of find() + operator[]
+            if (it == key_to_opt.end()) {
+                // TODO: maybe warn about unknown key?
+                continue;
+            }
+            preset.options[it->second] = value;
+            LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
diff --git a/common/preset.h b/common/preset.h
new file mode 100644
index 000000000..dceb849eb
--- /dev/null
+++ b/common/preset.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "common.h"
+#include "arg.h"
+
+#include <string>
+#include <vector>
+#include <map>
+
+//
+// INI preset parser and writer
+//
+
+constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
+
+struct common_preset {
+    std::string name; // written as the "[name]" section header by to_ini()
+    // TODO: support repeated args in the future
+    std::map<common_arg, std::string> options; // option -> raw string value
+
+    // convert preset to CLI argument list (flags with a falsey value are omitted; 2-value options throw)
+    std::vector<std::string> to_args() const;
+
+    // convert preset to INI format string: a "[name]" header followed by "key = value" lines
+    std::string to_ini() const;
+
+    // TODO: maybe implement to_env() if needed
+};
+
+// interface for multiple presets in one file
+using common_presets = std::map<std::string, common_preset>;
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 572fd0143..dbd11d657 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2330,13 +2330,11 @@ extern "C" {
             float                 stop,
             float                 step);
 
-#define GGML_KQ_MASK_PAD 1
-
-    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
-    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
-    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    // q:    [n_embd_k, n_batch, n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch, ne32,      ne33]
+    // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
     //
     // broadcast:
     //   n_head % n_head_kv == 0
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a5995fdc2..ec16cbda9 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, addr, tensor);
-#endif
-
     struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
 
     // see if we can merge with an existing block
@@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
     }
     // otherwise, add a new block
     ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-
-    GGML_UNUSED(tensor);
 }
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
 
     GGML_ASSERT(parent_size >= node_size);
 
+    // note: we want the chunks to remain aligned after the free
+    struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+    parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+    node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
+
     if (parent_size > node_size) {
-        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
         struct buffer_address p_addr = p_hn->addr;
         p_addr.offset += node_size;
         size_t extra_size = parent_size - node_size;
         AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+        ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
     }
 }
 
@@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+
+    ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
     hn->allocated = false;
 }
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 2223b33dc..2f3205fb9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -191,6 +191,9 @@ typedef void * thread_ret_t;
 
 typedef pthread_t ggml_thread_t;
 
+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU) // selects the low bits of threadpool->n_graph, which hold the active thread count
+#define GGML_THREADPOOL_N_THREADS_BITS (16)      // width of that field in bits; presumably the shift for the graph counter - TODO confirm
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -453,7 +456,7 @@ struct ggml_threadpool {
     struct ggml_cplan  * cplan;
 
     // synchronization primitives
-    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph;       // updated when there is work to be done (i.e. each graph); holds both the graph counter and the active thread count
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -461,12 +464,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
     atomic_bool pause;        // Used for pausing the threadpool or individual threads
-    atomic_int abort;         // Used for aborting processing of a graph
+    atomic_int  abort;        // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers;   // per thread state
-    int          n_threads_max; // number of threads in the pool
-    atomic_int   n_threads_cur; // number of threads used in the current graph
-
+    int          n_threads;   // Number of threads in the pool
     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)
 
@@ -543,7 +544,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};
 
 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -560,7 +561,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
 
-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -3475,7 +3476,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;
 
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -3551,7 +3552,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }
 
 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -3778,12 +3779,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
 
+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
@@ -3805,6 +3808,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }
 
+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);
 
     return 0;
@@ -3812,27 +3817,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 #ifndef GGML_USE_OPENMP
 
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit; on a new graph, state->pending is set only if this thread is among the active ones
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending    = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK; // thread count is read from the masked bits of n_graph
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads); // threads at or above the count get no work for this graph
+        state->last_graph = n_graph;
+        return true;
     }
 
-    return state->pending;
+    return false;
 }
 
 // sync thread state after polling
@@ -3849,11 +3850,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3915,7 +3911,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3930,14 +3925,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
     ggml_mutex_lock(&threadpool->mutex);
 
-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
 
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
 
     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
 
     if (threadpool->pause) {
        // Update main thread prio and affinity to match the threadpool settings
@@ -3975,8 +3971,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->pause            = tpp->paused;
         threadpool->abort            = -1;
         threadpool->workers          = NULL;
-        threadpool->n_threads_max    = tpp->n_threads;
-        threadpool->n_threads_cur    = tpp->n_threads;
+        threadpool->n_threads        = tpp->n_threads;
         threadpool->poll             = tpp->poll;
         threadpool->prio             = tpp->prio;
         threadpool->ec               = GGML_STATUS_SUCCESS;
@@ -4071,7 +4066,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             {
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
-                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+                atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
             }
 
             // Apply thread CPU mask and priority
@@ -4084,13 +4079,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }
 
     // Kick all threads to start the new graph
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 75b31346f..254e9875d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -67,19 +67,22 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
 
-#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
-#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
-#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
-#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
-#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_AMD(cc)     ((cc) >= GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_RDNA(cc)    ((cc) >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc)   ((cc) >= GGML_CUDA_CC_RDNA1 && (cc) < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc)   ((cc) >= GGML_CUDA_CC_RDNA2 && (cc) < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3_0(cc) ((cc) >= GGML_CUDA_CC_RDNA3 && (cc) < GGML_CUDA_CC_RDNA3_5) // RDNA3 range below RDNA3.5
+#define GGML_CUDA_CC_IS_RDNA3_5(cc) ((cc) >= GGML_CUDA_CC_RDNA3_5 && (cc) < GGML_CUDA_CC_RDNA4) // RDNA3.5 range only
+#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) // either RDNA3 sub-range
+#define GGML_CUDA_CC_IS_RDNA4(cc)   ((cc) >= GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_GCN(cc)     ((cc) > GGML_CUDA_CC_OFFSET_AMD && (cc) < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)    ((cc) >= GGML_CUDA_CC_CDNA1 && (cc) < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc)   ((cc) >= GGML_CUDA_CC_CDNA1 && (cc) < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc)   ((cc) >= GGML_CUDA_CC_CDNA2 && (cc) < GGML_CUDA_CC_CDNA3)
+#define GGML_CUDA_CC_IS_CDNA3(cc)   ((cc) >= GGML_CUDA_CC_CDNA3 && (cc) < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
 #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb9fe4056..8c80db816 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4643,9 +4643,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CUMSUM:
         case GGML_OP_TRI:
         case GGML_OP_DIAG:
-            return true;
         case GGML_OP_SOLVE_TRI:
-            return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
+            return true;
+
         default:
             return false;
     }
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
index 0b13293da..dcfa40f4d 100644
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
                 return 8 * (threadIdx.x / 16) + l;
 #elif defined(RDNA3)
                 return 2 * l + (threadIdx.x / 16);
+#else
+                NO_DEVICE_CODE;
+                return -1;
 #endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
@@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
             }
         }
 #elif defined(AMD_WMMA_AVAILABLE)
-
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
         static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
         half2 x[ne] = {{0.0f, 0.0f}};
 
         static constexpr __device__ bool supported() {
@@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
         static constexpr int         I  = I_;
         static constexpr int         J  = J_;
         static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
-        static constexpr int         ne = I * J / WARP_SIZE;
-
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
 
 #if defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3)
+        // RDNA3 has duplicated data as input.
+        static constexpr int ne = I * J / 32 * 2;
+#else
+        static constexpr int ne = I * J / 32;
+#endif // defined(RDNA3)
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
         static constexpr __device__ bool supported() {
             if (I == 16 && J == 8) return true;
             return false;
@@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 16 && J == 8) {
+#if defined(RDNA4)
                 return 4 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return l;
+#else
+                NO_DEVICE_CODE;
+                return -1;
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
             }
         }
 #else
+        static constexpr int ne = I * J / WARP_SIZE;
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
         static constexpr __device__ bool supported() {
             if (I ==  8 && J ==  8) return true;
             if (I == 16 && J ==  4) return true;
@@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
         }
 #elif defined(AMD_WMMA_AVAILABLE)
         if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
-            ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
-
+#if defined(RDNA4)
+                ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+#elif defined(RDNA3)
+                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+                ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
+#else
+                NO_DEVICE_CODE;
+#endif // defined(RDNA4)
         } else if constexpr (std::is_same_v<T, int>) {
             if constexpr (I == 16 && J == 4) {
                 int64_t * xi = (int64_t *) t.x;
@@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
         const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
         const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
         acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+        using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
+        using floatx8_t = __attribute__((ext_vector_type(8))) float;
+        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+        const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
+        const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
+        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
         GGML_UNUSED_VARS(D, A, B);
@@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
         const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
         const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
         acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#elif defined(RDNA3)
+        using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
+        using floatx8_t = __attribute__((ext_vector_type(8))) float;
+        floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+        const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
+        const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
+        acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
 #endif // RDNA4
 #else
         GGML_UNUSED_VARS(D, A, B);
diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
index 7cf33f0dd..6643f243b 100644
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16) {
+        if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
+            return false;
+        } else if (src1_ncols > 16) {
             return false;
         }
     }
@@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
         case GGML_TYPE_F32:
             return ampere_mma_available(cc);
         case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
         case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
+            return ampere_mma_available(cc) || amd_wmma_available(cc);
         default:
             return false;
     }
diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 6238ce7eb..32948e4d7 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                 return ne11 <= 8;
             } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                 if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+                        return ne11 <= 3;
+                    }
+                    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
                         return ne11 <= 5;
                     }
                     return ne11 <= 2;
diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu
index e161d4dc4..177ffc268 100644
--- a/ggml/src/ggml-cuda/solve_tri.cu
+++ b/ggml/src/ggml-cuda/solve_tri.cu
@@ -3,6 +3,80 @@
 #include "solve_tri.cuh"
 
 #define MAX_N_FAST 64
+#define MAX_K_FAST 32
+
+// Fills the pointer tables for the batched solve: entry idx = i3 * ne02 + i2 receives the addresses
+// A + i3 * s03 + i2 * s02 and X + i3 * s3 + i2 * s2; threads with idx >= total_batches do nothing.
+static __global__ void get_batch_pointers(const float *  A,
+                                          float *        X,
+                                          const float ** A_ptrs,
+                                          float **       X_ptrs,
+                                          int64_t        ne02,
+                                          int64_t        total_batches,
+                                          size_t         s02,
+                                          size_t         s03,
+                                          size_t         s2,
+                                          size_t         s3) {
+    const int batch = blockIdx.x * blockDim.x + threadIdx.x;
+    if (batch < total_batches) {
+        const int64_t outer = batch / ne02;  // batch index scaled by s03 / s3
+        const int64_t inner = batch % ne02;  // batch index scaled by s02 / s2
+
+        A_ptrs[batch] = A + outer * s03 + inner * s02;
+        X_ptrs[batch] = X + outer * s3 + inner * s2;
+    }
+}
+
+// Batched triangular solve via cublasStrsmBatched; the caller picks it when n or k exceed the fast-kernel limits.
+// B is copied into X first, and X is then handed to cuBLAS as the right-hand-side/output matrices.
+// NOTE: the copy treats B and X as one dense block of n * k * ne02 * ne03 floats, so s12/s13 go unused.
+static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
+                                 const float *               A,
+                                 const float *               B,
+                                 float *                     X,
+                                 int                         n,
+                                 int                         k,
+                                 int64_t                     ne02,
+                                 int64_t                     ne03,
+                                 size_t                      s02,
+                                 size_t                      s03,
+                                 size_t                      s12,
+                                 size_t                      s13,
+                                 size_t                      s2,
+                                 size_t                      s3,
+                                 cudaStream_t                stream) {
+    const float   alpha         = 1.0f;
+    const int64_t total_batches = ne02 * ne03;
+    if (total_batches == 0) {
+        return;
+    }
+
+    // Bulk copy B -> X (contiguous tensors)
+    if (X != B) {
+        // widen before multiplying: n and k are int, so n * k on its own could overflow
+        const int64_t total_elements_BX = (int64_t) n * k * total_batches;
+        CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+    }
+
+    const int id = ggml_cuda_get_device();
+    ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
+    ggml_cuda_pool_alloc<float *>       X_ptrs_alloc(ctx.pool(id), total_batches);
+    const float ** A_ptrs_dev = A_ptrs_alloc.get();
+    float **       X_ptrs_dev = X_ptrs_alloc.get();
+
+    get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
+                                                                        total_batches, s02, s03, s2, s3);
+
+    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+    // Yes, this is necessary, without this we get RMSE errors
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
+    CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                                    CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
+
+    // revert to standard mode from common.cuh
+    CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
+    GGML_UNUSED_VARS(s12, s13);
+}
 
 // ======================
 // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
     float x_low  = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
     float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
 
-    const int half = WARP_SIZE;
+    const int half      = WARP_SIZE;
     const int nrows_low = (n < half) ? n : half;
 
 #pragma unroll
@@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
 
 #pragma unroll
     for (int row = half; row < n; ++row) {
-        float sum = sA[row * n + lane] * x_low;
-        const int j = half + lane;
+        float     sum = sA[row * n + lane] * x_low;
+        const int j   = half + lane;
         if (j < row) {
             sum += sA[row * n + j] * x_high;
         }
@@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
     for (int rr = 0; rr < 2; ++rr) {
         const int row = rr * WARP_SIZE + lane;
         if (row < n) {
-            const float val = (row < half) ? x_low : x_high;
+            const float val            = (row < half) ? x_low : x_high;
             X_batch[row * k + col_idx] = val;
         }
     }
@@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
 }
 
 void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];  // A (triangular n x x matrix)
-    const ggml_tensor * src1 = dst->src[1];  // B (right hand side of n x k equation columns)
+    const ggml_tensor * src0 = dst->src[0];  // A (n×n, lower triangular)
+    const ggml_tensor * src1 = dst->src[1];  // B (n×k)
 
-    ggml_is_contiguous(src0);
-    ggml_is_contiguous(src1);
+    GGML_ASSERT(ggml_is_contiguous(src0));  // the solve paths below read the data as densely packed
+    GGML_ASSERT(ggml_is_contiguous(src1));
 
-    const int64_t n = src0->ne[0];
-    const int64_t k = src1->ne[0];
+    const int64_t n    = src0->ne[0];
+    const int64_t k    = src1->ne[0];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    GGML_ASSERT(n <= 64);
-    GGML_ASSERT(k <= 32);
-
-    solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
-                       src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-                       src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-                       dst->nb[3] / sizeof(float), ctx.stream());
+    if (n <= MAX_N_FAST && k <= MAX_K_FAST) {  // within the fast-kernel limits
+        solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                           ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                           src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                           dst->nb[3] / sizeof(float), ctx.stream());
+    } else {  // anything larger goes through the batched cuBLAS solve
+        solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+                             ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+                             src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+                             dst->nb[3] / sizeof(float), ctx.stream());
+    }
 }
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index b7d6edf7f..951a88d56 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -19,6 +19,9 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_16BF HIPBLAS_R_16B
 #define CUDA_R_32F  HIPBLAS_R_32F
+#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
 #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@@ -30,6 +33,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define __all_sync(mask, var) __all(var)
 #define __any_sync(mask, var) __any(var)
+#define cublasStrsmBatched hipblasStrsmBatched
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
index 8c55a2e4e..221e67f96 100644
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -12,11 +12,16 @@
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
+#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F  MUSA_R_32F
+#define cublasStrsmBatched mublasStrsmBatched
 #define cublasComputeType_t cudaDataType_t
 #define cublasCreate mublasCreate
 #define cublasDestroy mublasDestroy
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 86e0a9c58..e130a9328 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5276,8 +5276,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
 
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
-                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
 
         GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
diff --git a/otherarch/whispercpp/whisper.cpp b/otherarch/whispercpp/whisper.cpp
index 499b51e45..00c58cbfa 100644
--- a/otherarch/whispercpp/whisper.cpp
+++ b/otherarch/whispercpp/whisper.cpp
@@ -2378,7 +2378,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
     const float KQscale = pow(float(n_state_head), -0.25);
 
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, 1), 1);
     ggml_set_name(KQ_mask, "KQ_mask");
     ggml_set_input(KQ_mask);
 
@@ -2806,7 +2806,7 @@ static bool whisper_decode_internal(
                     }
                 }
 
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int i = n_tokens; i < GGML_PAD(n_tokens, 1); ++i) {
                     for (int j = 0; j < n_kv; ++j) {
                         data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                     }
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 86a1a4ba1..386fab04a 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
     udata->output    .resize(n_tokens);
 
+    udata->seq_id_data.reserve(n_tokens);
+
     seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i]   = batch.seq_id[idxs[i]];
         udata->output[i]   = batch.logits[idxs[i]];
 
         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }
 
         if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }
 
+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
diff --git a/src/llama-batch.h b/src/llama-batch.h
index 209cf3699..8e6fac0ef 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -56,13 +56,15 @@ struct llama_ubatch {
         std::vector<float>          embd;
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id;      // these point into the seq_id_data below
         std::vector<llama_seq_id>   seq_id_unq;
         std::vector<int32_t>        seq_idx;
         std::vector<int8_t>         output;
+
+        std::vector<llama_seq_id> seq_id_data;
     };
 
-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cf93ae314..f186b17ee 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -96,14 +96,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 43620df78..6cf9a883a 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
   //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
   //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }
 
-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
@@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 41887c89c..0390a7ade 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     GGML_ASSERT(n_tokens%n_stream == 0);
 
     // n_tps == n_tokens_per_stream
-    const int64_t n_tps     = n_tokens/n_stream;
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
+    const int64_t n_tps = n_tokens/n_stream;
 
     std::fill(data, data + ggml_nelements(dst), -INFINITY);
 
@@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                 const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                 const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
 
-                const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
 
                 for (uint32_t j = 0; j < n_kv; ++j) {
                     if (cells.is_empty(j)) {
diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp
index 426e0436e..87954dd76 100644
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -87,6 +87,10 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     common_params params;
     g_params = &params;
+
+    // disable jinja by default
+    params.use_jinja = false;
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
         return 1;
     }
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index cab35e6fb..03ebcfa12 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -622,11 +622,12 @@ struct clip_graph {
             cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
             cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
 
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
             cur = build_ffn(cur,
@@ -694,16 +695,12 @@ struct clip_graph {
 
         // LlavaMultiModalProjector (always using GELU activation)
         {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
         }
 
         // arrangement of the [IMG_BREAK] token
@@ -802,10 +799,6 @@ struct clip_graph {
 
             // if flash attn is used, we need to pad the mask and cast to f16
             if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-                int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
-                if (n_pad > 0) {
-                    window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
-                }
                 window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
             }
 
@@ -818,7 +811,7 @@ struct clip_graph {
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
-            auto & layer = model.layers[il];
+            const auto & layer = model.layers[il];
             const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
 
             ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
@@ -897,16 +890,12 @@ struct clip_graph {
         // multimodal projection
         ggml_tensor * embeddings = inpL;
         embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+                            model.mm_0_w, model.mm_0_b,
+                            nullptr, nullptr,
+                            model.mm_1_w, model.mm_1_b,
+                            FFN_GELU,
+                            -1);
 
         if (use_window_attn) {
             window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1284,11 +1273,12 @@ struct clip_graph {
             // projector LayerNorm uses pytorch's default eps = 1e-5
             // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
             cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_3_w, model.mm_3_b,
+                FFN_GELU,
+                -1);
         }
 
         // build the graph
@@ -1439,11 +1429,12 @@ struct clip_graph {
             cb(cur, "proj_inp_normed", -1);
 
             // projection mlp
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
             cb(cur, "proj_out", -1);
         }
 
@@ -1914,9 +1905,12 @@ struct clip_graph {
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
             // projector
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU_ERF,
+                -1);
 
         } else {
             GGML_ABORT("%s: unknown projector type", __func__);
@@ -2101,34 +2095,66 @@ private:
 
             // self-attention
             {
-                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-                if (layer.q_b) {
-                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-                }
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
+                if (layer.qkv_w != nullptr) {
+                    // fused qkv
+                    cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                    if (layer.qkv_b != nullptr) {
+                        cur = ggml_add(ctx0, cur, layer.qkv_b);
+                    }
 
-                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-                if (layer.k_b) {
-                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-                }
+                    Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ 0);
 
-                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-                if (layer.v_b) {
-                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-                }
+                    Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, n_embd));
 
-                if (layer.q_norm) {
-                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-                    cb(Qcur, "Qcur_norm", il);
-                }
+                    Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, 2 * n_embd));
 
-                if (layer.k_norm) {
-                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-                    cb(Kcur, "Kcur_norm", il);
-                }
+                    // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                    // we can add support in the future if needed
+                    GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                } else {
+                    // separate q, k, v
+                    Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                    if (layer.q_b) {
+                        Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                    }
+
+                    Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                    if (layer.k_b) {
+                        Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                    }
+
+                    Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                    if (layer.v_b) {
+                        Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                    }
+
+                    if (layer.q_norm) {
+                        Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                        cb(Qcur, "Qcur_norm", il);
+                    }
+
+                    if (layer.k_norm) {
+                        Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                        cb(Kcur, "Kcur_norm", il);
+                    }
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index a75af406c..25d24603d 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -270,6 +270,7 @@ int main(int argc, char ** argv) {
     ggml_time_init();
 
     common_params params;
+    params.use_jinja = false;   // disable jinja by default
     params.sampling.temp = 0.2; // lower temp by default for better quality
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
@@ -317,7 +318,9 @@ int main(int argc, char ** argv) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
             for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
+                // most models require the marker before each image
+                // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+                params.prompt = mtmd_default_marker() + params.prompt;
             }
         }
         common_chat_msg msg;
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 472f7d821..82b486ec9 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -32,23 +32,32 @@ fi
 
 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()
 
 add_test_vision() {
     local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[vision]")
     arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-1.jpeg")
 }
 
 add_test_audio() {
     local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-2.mp3")
 }
 
@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio  "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
     add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
     # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
 
     add_test_audio  "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
     add_test_audio  "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
     prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
     inp_file="${arr_file[$i]}"
 
     echo "Running test with binary: $bin and HF model: $hf"
     echo ""
     echo ""
 
-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
         --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # add a default prompt unless extra_args already passes -p as its own word
+    if [[ " $extra_args " != *" -p "* ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)
 
     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
 
@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
     if echo "$output" | grep -iq "new york" \
             || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
     then
-        result="$prefix \033[32mOK\033[0m:   $bin $hf"
+        result="$prefix \033[32mOK\033[0m:   $hf"
     else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 2db04e952..3fd631b77 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 6f88e93c4..6c618a673 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -1,6 +1,7 @@
 #include "server-common.h"
 #include "server-models.h"
 
+#include "preset.h"
 #include "download.h"
 
 #include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
@@ -33,6 +34,10 @@
 
 #define CMD_EXIT "exit"
 
+// address for child process, this is needed because router may run on 0.0.0.0
+// ref: https://github.com/ggml-org/llama.cpp/issues/17862
+#define CHILD_ADDR "127.0.0.1"
+
 static std::filesystem::path get_server_exec_path() {
 #if defined(_WIN32)
     wchar_t buf[32768] = { 0 };  // Large buffer to handle long paths
@@ -132,6 +137,93 @@ static std::vector<local_model> list_local_models(const std::string & dir) {
     return models;
 }
 
+//
+// server_presets
+//
+
+
+server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
+        : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
+    // env names of the options that are reserved for the router
+    static const char * const reserved_envs[] = {
+        "LLAMA_ARG_PORT",
+        "LLAMA_ARG_HOST",
+        "LLAMA_ARG_ALIAS",
+        "LLAMA_ARG_API_KEY",
+        "LLAMA_ARG_MODELS_DIR",
+        "LLAMA_ARG_MODELS_MAX",
+        "LLAMA_ARG_MODELS_PRESET",
+        "LLAMA_ARG_MODEL",
+        "LLAMA_ARG_MMPROJ",
+        "LLAMA_ARG_HF_REPO",
+        "LLAMA_ARG_NO_MODELS_AUTOLOAD",
+    };
+
+    // the presets file is optional
+    if (!presets_path.empty()) {
+        presets = common_presets_load(presets_path, ctx_params);
+        SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
+    }
+
+    // populate reserved args (will be appended by the router)
+    for (auto & opt : ctx_params.options) {
+        const std::string env = opt.env ? opt.env : "";
+        for (const char * reserved : reserved_envs) {
+            if (env == reserved) {
+                control_args[env] = opt;
+            }
+        }
+    }
+
+    // read base args from router's argv, then drop the router-controlled ones
+    common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
+    for (const auto & cargs : control_args) {
+        base_args.erase(cargs.second);
+    }
+}
+
+common_preset server_presets::get_preset(const std::string & name) {
+    // look up the preset registered under this name
+    // unknown names yield a default-constructed preset
+    const auto found = presets.find(name);
+    const bool known = found != presets.end();
+    return known ? found->second : common_preset();
+}
+
+void server_presets::render_args(server_model_meta & meta) {
+    common_preset merged = meta.preset; // copy, meta.preset itself is not modified
+    // args are layered from 3 sources; each later layer overrides the earlier ones
+    // 1. model-specific args (from preset), minus anything reserved for the router
+    for (auto & cargs : control_args) {
+        const common_arg & reserved = cargs.second;
+        if (merged.options.erase(reserved) > 0) {
+            SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", merged.name.c_str(), reserved.args[0]);
+        }
+    }
+    // 2. base args (from router)
+    for (const auto & base : base_args) {
+        merged.options[base.first] = base.second;
+    }
+    // 3. control args (from router)
+    const auto set_control = [&](const char * env, const std::string & value) {
+        merged.options[control_args[env]] = value;
+    };
+    set_control("LLAMA_ARG_HOST",  CHILD_ADDR);
+    set_control("LLAMA_ARG_PORT",  std::to_string(meta.port));
+    set_control("LLAMA_ARG_ALIAS", meta.name);
+    if (meta.in_cache) {
+        set_control("LLAMA_ARG_HF_REPO", meta.name);
+    } else {
+        set_control("LLAMA_ARG_MODEL", meta.path);
+        if (!meta.path_mmproj.empty()) {
+            set_control("LLAMA_ARG_MMPROJ", meta.path_mmproj);
+        }
+    }
+    // rendered options, with the server binary path placed at the front
+    meta.args = merged.to_args();
+    meta.args.insert(meta.args.begin(), get_server_exec_path().string());
+}
+
 //
 // server_models
 //
@@ -140,7 +232,7 @@ server_models::server_models(
         const common_params & params,
         int argc,
         char ** argv,
-        char ** envp) : base_params(params) {
+        char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
     for (int i = 0; i < argc; i++) {
         base_args.push_back(std::string(argv[i]));
     }
@@ -155,11 +247,58 @@ server_models::server_models(
         LOG_WRN("failed to get server executable path: %s\n", e.what());
         LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
     }
-    // TODO: allow refreshing cached model list
-    // add cached models
+    load_models();
+}
+
+void server_models::add_model(server_model_meta && meta) {
+    const std::string key = meta.name; // copied first: meta is moved from below
+    if (mapping.count(key) > 0) {
+        throw std::runtime_error(string_format("model '%s' appears multiple times", key.c_str()));
+    }
+    presets.render_args(meta); // fills in meta.args
+    mapping[key] = instance_t{
+        /* subproc */ std::make_shared<subprocess_s>(),
+        /* th      */ std::thread(),
+        /* meta    */ std::move(meta)
+    };
+}
+
+static std::vector<local_model> list_custom_path_models(server_presets & presets) {
+    // a preset that carries its own model path describes a custom-path model;
+    // the path options are taken out of the preset and returned as model entries
+    std::vector<local_model> found;
+    for (auto & entry : presets.presets) {
+        auto & options = entry.second.options;
+        local_model model;
+        model.name = entry.first;
+        for (auto it = options.begin(); it != options.end(); ) {
+            const std::string env(it->first.env ? it->first.env : "");
+            if (env == "LLAMA_ARG_MODEL") {
+                model.path = it->second;
+            } else if (env == "LLAMA_ARG_MMPROJ") {
+                model.path_mmproj = it->second;
+            } else {
+                ++it;
+                continue;
+            }
+            // the option has been consumed, drop it from the preset
+            it = options.erase(it);
+        }
+        if (!model.name.empty() && !model.path.empty()) {
+            found.push_back(model);
+        }
+    }
+    return found;
+}
+
+// TODO: allow refreshing cached model list
+void server_models::load_models() {
+    // loading models from 3 sources:
+    // 1. cached models
     auto cached_models = common_list_cached_models();
     for (const auto & model : cached_models) {
         server_model_meta meta{
+            /* preset      */ presets.get_preset(model.to_string()),
             /* name        */ model.to_string(),
             /* path        */ model.manifest_path,
             /* path_mmproj */ "", // auto-detected when loading
@@ -170,21 +309,18 @@ server_models::server_models(
             /* args        */ std::vector<std::string>(),
             /* exit_code   */ 0
         };
-        mapping[meta.name] = instance_t{
-            /* subproc */ std::make_shared<subprocess_s>(),
-            /* th      */ std::thread(),
-            /* meta    */ meta
-        };
+        add_model(std::move(meta));
     }
-    // add local models specificed via --models-dir
-    if (!params.models_dir.empty()) {
-        auto local_models = list_local_models(params.models_dir);
+    // 2. local models specified via --models-dir
+    if (!base_params.models_dir.empty()) {
+        auto local_models = list_local_models(base_params.models_dir);
         for (const auto & model : local_models) {
             if (mapping.find(model.name) != mapping.end()) {
                 // already exists in cached models, skip
                 continue;
             }
             server_model_meta meta{
+                /* preset      */ presets.get_preset(model.name),
                 /* name        */ model.name,
                 /* path        */ model.path,
                 /* path_mmproj */ model.path_mmproj,
@@ -195,13 +331,31 @@ server_models::server_models(
                 /* args        */ std::vector<std::string>(),
                 /* exit_code   */ 0
             };
-            mapping[meta.name] = instance_t{
-                /* subproc */ std::make_shared<subprocess_s>(),
-                /* th      */ std::thread(),
-                /* meta    */ meta
-            };
+            add_model(std::move(meta));
         }
     }
+    // 3. custom-path models specified in presets
+    auto custom_models = list_custom_path_models(presets);
+    for (const auto & model : custom_models) {
+        server_model_meta meta{
+            /* preset      */ presets.get_preset(model.name),
+            /* name        */ model.name,
+            /* path        */ model.path,
+            /* path_mmproj */ model.path_mmproj,
+            /* in_cache    */ false,
+            /* port        */ 0,
+            /* status      */ SERVER_MODEL_STATUS_UNLOADED,
+            /* last_used   */ 0,
+            /* args        */ std::vector<std::string>(),
+            /* exit_code   */ 0
+        };
+        add_model(std::move(meta));
+    }
+    // log available models
+    SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
+    for (const auto & [name, inst] : mapping) {
+        SRV_INF("  %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
+    }
 }
 
 void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
@@ -335,19 +489,7 @@ void server_models::unload_lru() {
     }
 }
 
-static void add_or_replace_arg(std::vector<std::string> & args, const std::string & key, const std::string & value) {
-    for (size_t i = 0; i < args.size(); i++) {
-        if (args[i] == key && i + 1 < args.size()) {
-            args[i + 1] = value;
-            return;
-        }
-    }
-    // not found, append
-    args.push_back(key);
-    args.push_back(value);
-}
-
-void server_models::load(const std::string & name, bool auto_load) {
+void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
@@ -376,26 +518,10 @@ void server_models::load(const std::string & name, bool auto_load) {
     {
         SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
 
-        std::vector<std::string> child_args;
-        if (auto_load && !meta.args.empty()) {
-            child_args = meta.args; // copy previous args
-        } else {
-            child_args = base_args; // copy
-            if (inst.meta.in_cache) {
-                add_or_replace_arg(child_args, "-hf", inst.meta.name);
-            } else {
-                add_or_replace_arg(child_args, "-m", inst.meta.path);
-                if (!inst.meta.path_mmproj.empty()) {
-                    add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj);
-                }
-            }
-        }
+        presets.render_args(inst.meta); // update meta.args
 
-        // set model args
-        add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port));
-        add_or_replace_arg(child_args, "--alias", inst.meta.name);
-
-        std::vector<std::string> child_env = base_env; // copy
+        std::vector<std::string> child_args = inst.meta.args; // copy
+        std::vector<std::string> child_env  = base_env; // copy
         child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
 
         SRV_INF("%s", "spawning server instance with args:\n");
@@ -541,7 +667,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
     }
     if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
         SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
-        load(name, true);
+        load(name);
     }
 
     SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
@@ -571,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
     SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
     auto proxy = std::make_unique<server_http_proxy>(
             method,
-            base_params.hostname,
+            CHILD_ADDR,
             meta->port,
             req.path,
             req.headers,
@@ -724,38 +850,6 @@ void server_models_routes::init_routes() {
         return models.proxy_request(req, method, name, true); // update last usage for POST request only
     };
 
-    this->get_router_models = [this](const server_http_req &) {
-        auto res = std::make_unique<server_http_res>();
-        json models_json = json::array();
-        auto all_models = models.get_all_meta();
-        std::time_t t = std::time(0);
-        for (const auto & meta : all_models) {
-            json status {
-                {"value", server_model_status_to_string(meta.status)},
-                {"args",  meta.args},
-            };
-            if (meta.is_failed()) {
-                status["exit_code"] = meta.exit_code;
-                status["failed"]    = true;
-            }
-            models_json.push_back(json {
-                {"id",       meta.name},
-                {"object",   "model"},    // for OAI-compat
-                {"owned_by", "llamacpp"}, // for OAI-compat
-                {"created",  t},          // for OAI-compat
-                {"in_cache", meta.in_cache},
-                {"path",     meta.path},
-                {"status",   status},
-                // TODO: add other fields, may require reading GGUF metadata
-            });
-        }
-        res_ok(res, {
-            {"data", models_json},
-            {"object", "list"},
-        });
-        return res;
-    };
-
     this->post_router_models_load = [this](const server_http_req & req) {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
@@ -769,7 +863,7 @@ void server_models_routes::init_routes() {
             res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
-        models.load(name, false);
+        models.load(name);
         res_ok(res, {{"success", true}});
         return res;
     };
@@ -793,9 +887,12 @@ void server_models_routes::init_routes() {
         std::time_t t = std::time(0);
         for (const auto & meta : all_models) {
             json status {
-                {"value", server_model_status_to_string(meta.status)},
-                {"args",  meta.args},
+                {"value",  server_model_status_to_string(meta.status)},
+                {"args",   meta.args},
             };
+            if (!meta.preset.name.empty()) {
+                status["preset"] = meta.preset.to_ini();
+            }
             if (meta.is_failed()) {
                 status["exit_code"] = meta.exit_code;
                 status["failed"]    = true;
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 526e7488d..9cdbbad9b 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common.h"
+#include "preset.h"
 #include "server-http.h"
 
 #include <mutex>
@@ -47,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) {
 }
 
 struct server_model_meta {
+    common_preset preset;
     std::string name;
     std::string path;
     std::string path_mmproj; // only available if in_cache=false
@@ -54,7 +56,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
+    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
 
     bool is_active() const {
@@ -66,6 +68,19 @@ struct server_model_meta {
     }
 };
 
+// server_presets holds the presets read from presets.ini together with
+// the base args of the router server; server_models owns one instance of it
+struct server_presets {
+    common_presets presets; // presets read from presets.ini
+    common_params_context ctx_params; // NOTE(review): presumably the arg-parser context used to look up option definitions - confirm
+    std::map<common_arg, std::string> base_args; // base args from the router server: option -> value string
+    std::map<std::string, common_arg> control_args; // args reserved for server control
+
+    server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
+    common_preset get_preset(const std::string & name); // result is stored in server_model_meta::preset for the model called `name`
+    void render_args(server_model_meta & meta); // populates meta.args; server_models::load() calls it before copying meta.args into the child args
+};
+
 struct subprocess_s;
 
 struct server_models {
@@ -85,14 +100,21 @@ private:
     std::vector<std::string> base_args;
     std::vector<std::string> base_env;
 
+    server_presets presets;
+
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
     void unload_lru();
 
+    // not thread-safe, caller must hold mutex
+    void add_model(server_model_meta && meta);
+
 public:
     server_models(const common_params & params, int argc, char ** argv, char ** envp);
 
+    void load_models();
+
     // check if a model instance exists
     bool has_model(const std::string & name);
 
@@ -102,8 +124,7 @@ public:
     // return a copy of all model metadata
     std::vector<server_model_meta> get_all_meta();
 
-    // if auto_load is true, load the model with previous args if any
-    void load(const std::string & name, bool auto_load);
+    void load(const std::string & name);
     void unload(const std::string & name);
     void unload_all();
 
diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json
index 9c1c2499c..4f37b308b 100644
--- a/tools/server/webui/package-lock.json
+++ b/tools/server/webui/package-lock.json
@@ -41,7 +41,7 @@
 				"@tailwindcss/vite": "^4.0.0",
 				"@types/node": "^22",
 				"@vitest/browser": "^3.2.3",
-				"bits-ui": "^2.8.11",
+				"bits-ui": "^2.14.4",
 				"clsx": "^2.1.1",
 				"dexie": "^4.0.11",
 				"eslint": "^9.18.0",
@@ -3343,17 +3343,17 @@
 			}
 		},
 		"node_modules/bits-ui": {
-			"version": "2.8.11",
-			"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
-			"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
+			"version": "2.14.4",
+			"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
+			"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
 			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"@floating-ui/core": "^1.7.1",
 				"@floating-ui/dom": "^1.7.1",
 				"esm-env": "^1.1.2",
-				"runed": "^0.29.1",
-				"svelte-toolbelt": "^0.9.3",
+				"runed": "^0.35.1",
+				"svelte-toolbelt": "^0.10.6",
 				"tabbable": "^6.2.0"
 			},
 			"engines": {
@@ -3368,9 +3368,9 @@
 			}
 		},
 		"node_modules/bits-ui/node_modules/runed": {
-			"version": "0.29.2",
-			"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
-			"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
+			"version": "0.35.1",
+			"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
+			"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
 			"dev": true,
 			"funding": [
 				"https://github.com/sponsors/huntabyte",
@@ -3378,23 +3378,31 @@
 			],
 			"license": "MIT",
 			"dependencies": {
-				"esm-env": "^1.0.0"
+				"dequal": "^2.0.3",
+				"esm-env": "^1.0.0",
+				"lz-string": "^1.5.0"
 			},
 			"peerDependencies": {
+				"@sveltejs/kit": "^2.21.0",
 				"svelte": "^5.7.0"
+			},
+			"peerDependenciesMeta": {
+				"@sveltejs/kit": {
+					"optional": true
+				}
 			}
 		},
 		"node_modules/bits-ui/node_modules/svelte-toolbelt": {
-			"version": "0.9.3",
-			"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
-			"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
+			"version": "0.10.6",
+			"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
+			"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
 			"dev": true,
 			"funding": [
 				"https://github.com/sponsors/huntabyte"
 			],
 			"dependencies": {
 				"clsx": "^2.1.1",
-				"runed": "^0.29.0",
+				"runed": "^0.35.1",
 				"style-to-object": "^1.0.8"
 			},
 			"engines": {
diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json
index 987a7239e..c20ab3cfd 100644
--- a/tools/server/webui/package.json
+++ b/tools/server/webui/package.json
@@ -43,7 +43,7 @@
 		"@tailwindcss/vite": "^4.0.0",
 		"@types/node": "^22",
 		"@vitest/browser": "^3.2.3",
-		"bits-ui": "^2.8.11",
+		"bits-ui": "^2.14.4",
 		"clsx": "^2.1.1",
 		"dexie": "^4.0.11",
 		"eslint": "^9.18.0",
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
index 7f8e38286..78cc1c47d 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -331,6 +331,7 @@
 	class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
 		? 'cursor-not-allowed opacity-60'
 		: ''} {className}"
+	data-slot="chat-form"
 >
 	<ChatAttachmentsList
 		bind:uploadedFiles
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte
index c9e6c6616..afc984702 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte
@@ -1,6 +1,5 @@
 <script lang="ts">
-	import { Input } from '$lib/components/ui/input';
-	import { Search } from '@lucide/svelte';
+	import { SearchInput } from '$lib/components/app';
 
 	interface Props {
 		value?: string;
@@ -15,19 +14,6 @@
 		onInput,
 		class: className
 	}: Props = $props();
-
-	function handleInput(event: Event) {
-		const target = event.target as HTMLInputElement;
-
-		value = target.value;
-		onInput?.(target.value);
-	}
 </script>
 
-<div class="relative mb-4 {className}">
-	<Search
-		class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
-	/>
-
-	<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
-</div>
+<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />
diff --git a/tools/server/webui/src/lib/components/app/index.ts b/tools/server/webui/src/lib/components/app/index.ts
index 87b24598b..8631d4fb3 100644
--- a/tools/server/webui/src/lib/components/app/index.ts
+++ b/tools/server/webui/src/lib/components/app/index.ts
@@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
 export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
 export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
 export { default as RemoveButton } from './misc/RemoveButton.svelte';
+export { default as SearchInput } from './misc/SearchInput.svelte';
 export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
 export { default as ModelsSelector } from './models/ModelsSelector.svelte';
 
diff --git a/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte
new file mode 100644
index 000000000..15cd6abaa
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte
@@ -0,0 +1,73 @@
+<script lang="ts">
+	import { Input } from '$lib/components/ui/input';
+	import { Search, X } from '@lucide/svelte';
+
+	interface Props {
+		value?: string; // current search text; bindable, defaults to ''
+		placeholder?: string; // placeholder of the input, defaults to 'Search...'
+		onInput?: (value: string) => void; // called with the new text on input, and with '' when the text is cleared
+		onClose?: () => void; // called when the X button is clicked while the text is empty
+		onKeyDown?: (event: KeyboardEvent) => void; // passed to the input as its keydown handler
+		class?: string; // appended to the class list of the wrapper div
+		id?: string; // passed to the input as its id
+		ref?: HTMLInputElement | null; // bindable; bound to the Input component's ref
+	}
+
+	let { // component props; value and ref are declared with $bindable
+		value = $bindable(''),
+		placeholder = 'Search...',
+		onInput,
+		onClose,
+		onKeyDown,
+		class: className, // exposed locally as className
+		id,
+		ref = $bindable(null)
+	}: Props = $props();
+
+	let showClearButton = $derived(!!value || !!onClose); // render the X button when there is text to clear or an onClose handler was given
+
+	function handleInput(event: Event) { // oninput handler of the Input: copy the typed text into `value` and notify the parent
+		const target = event.target as HTMLInputElement;
+
+		value = target.value;
+		onInput?.(target.value); // optional callback; skipped when onInput was not provided
+	}
+
+	function handleClear() { // onclick handler of the X button: clear the text, or request a close when it is already empty
+		if (value) {
+			value = '';
+			onInput?.(''); // report the cleared text to the parent
+			ref?.focus(); // put focus back on the input after clearing
+		} else {
+			onClose?.(); // nothing to clear: treat the click as a close request
+		}
+	}
+</script>
+
+<div class="relative {className}">
+	<Search
+		class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
+	/>
+
+	<Input
+		{id}
+		bind:value
+		bind:ref
+		class="pl-9 {showClearButton ? 'pr-9' : ''}"
+		oninput={handleInput}
+		onkeydown={onKeyDown}
+		{placeholder}
+		type="search"
+	/>
+
+	{#if showClearButton}
+		<button
+			type="button"
+			class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
+			onclick={handleClear}
+			aria-label={value ? 'Clear search' : 'Close'}
+		>
+			<X class="h-4 w-4" />
+		</button>
+	{/if}
+</div>
diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte
index c4331e92f..ac0937696 100644
--- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte
+++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte
@@ -2,8 +2,8 @@
 	import { onMount, tick } from 'svelte';
 	import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
 	import * as Tooltip from '$lib/components/ui/tooltip';
+	import * as Popover from '$lib/components/ui/popover';
 	import { cn } from '$lib/components/ui/utils';
-	import { portalToBody } from '$lib/utils';
 	import {
 		modelsStore,
 		modelOptions,
@@ -17,12 +17,8 @@
 	import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
 	import { ServerModelStatus } from '$lib/enums';
 	import { isRouterMode } from '$lib/stores/server.svelte';
-	import { DialogModelInformation } from '$lib/components/app';
-	import {
-		MENU_MAX_WIDTH,
-		MENU_OFFSET,
-		VIEWPORT_GUTTER
-	} from '$lib/constants/floating-ui-constraints';
+	import { DialogModelInformation, SearchInput } from '$lib/components/app';
+	import type { ModelOption } from '$lib/types/models';
 
 	interface Props {
 		class?: string;
@@ -145,185 +141,126 @@
 		return options.some((option) => option.model === currentModel);
 	});
 
-	let isOpen = $state(false);
-	let showModelDialog = $state(false);
-	let container: HTMLDivElement | null = null;
-	let menuRef = $state<HTMLDivElement | null>(null);
-	let triggerButton = $state<HTMLButtonElement | null>(null);
-	let menuPosition = $state<{
-		top: number;
-		left: number;
-		width: number;
-		placement: 'top' | 'bottom';
-		maxHeight: number;
-	} | null>(null);
+	let searchTerm = $state('');
+	let searchInputRef = $state<HTMLInputElement | null>(null);
+	let highlightedIndex = $state<number>(-1);
 
-	onMount(async () => {
-		try {
-			await modelsStore.fetch();
-		} catch (error) {
-			console.error('Unable to load models:', error);
-		}
+	let filteredOptions: ModelOption[] = $derived( // the subset of `options` that matches searchTerm
+		(() => {
+			const term = searchTerm.trim().toLowerCase();
+			if (!term) return options; // empty or whitespace-only search: keep every option
+
+			return options.filter( // case-insensitive substring match on model, or on name when it is set
+				(option) =>
+					option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
+			);
+		})()
+	);
+
+	// Indices into filteredOptions of the entries accepted by isModelCompatible(); arrow-key navigation steps through these
+	let compatibleIndices = $derived(
+		filteredOptions
+			.map((option, index) => (isModelCompatible(option) ? index : -1)) // -1 marks an incompatible entry
+			.filter((i) => i !== -1) // drop the markers, keeping real indices only
+	);
+
+	// Reset highlighted index (to -1, i.e. nothing highlighted) when search term changes
+	$effect(() => {
+		void searchTerm; // value is discarded; the read is what ties this effect to searchTerm
+		highlightedIndex = -1;
 	});
 
-	function toggleOpen() {
+	let isOpen = $state(false);
+	let showModelDialog = $state(false);
+
+	onMount(() => { // start loading the model list on mount without awaiting it
+		modelsStore.fetch().catch((error) => {
+			console.error('Unable to load models:', error); // a failed fetch is only logged
+		});
+	});
+
+	function handleOpenChange(open: boolean) { // passed to Popover.Root as onOpenChange; also called directly by open() and closeMenu()
 		if (loading || updating) return;
 
-		if (isRouter) {
-			// Router mode: show dropdown
-			if (isOpen) {
-				closeMenu();
-			} else {
-				openMenu();
+		if (open) {
+			isOpen = true;
+			searchTerm = ''; // every open starts with an empty search
+			highlightedIndex = -1; // and with nothing highlighted
+
+			// Focus search input after popover opens: wait for tick(), then for the next animation frame
+			tick().then(() => {
+				requestAnimationFrame(() => searchInputRef?.focus());
+			});
+
+			if (isRouter) { // router mode: refresh the router model list, then the modalities of loaded models
+				modelsStore.fetchRouterModels().then(() => {
+					modelsStore.fetchModalitiesForLoadedModels();
+				});
 			}
 		} else {
-			// Single model mode: show dialog
-			showModelDialog = true;
+			isOpen = false; // closing also clears the search and the highlight
+			searchTerm = '';
+			highlightedIndex = -1;
 		}
 	}
 
-	async function openMenu() {
+	function handleTriggerClick() { // onclick handler of Popover.Trigger
 		if (loading || updating) return;
 
-		isOpen = true;
-		await tick();
-		updateMenuPosition();
-		requestAnimationFrame(() => updateMenuPosition());
-
-		if (isRouter) {
-			modelsStore.fetchRouterModels().then(() => {
-				modelsStore.fetchModalitiesForLoadedModels();
-			});
+		if (!isRouter) {
+			// Single model mode: show dialog instead of popover
+			showModelDialog = true; // NOTE(review): the trigger is rendered with disabled={disabled || updating || !isRouter} - confirm a click can still reach this branch
 		}
+		// For router mode, the Popover handles open/close (via onOpenChange={handleOpenChange})
 	}
 
 	export function open() {
 		if (isRouter) {
-			openMenu();
+			handleOpenChange(true); // same path as the popover's onOpenChange(true); returns early while loading or updating
 		} else {
 			showModelDialog = true;
 		}
 	}
 
 	function closeMenu() {
-		if (!isOpen) return;
-
-		isOpen = false;
-		menuPosition = null;
+		handleOpenChange(false); // sets isOpen to false and clears search/highlight; does nothing while loading or updating
 	}
 
-	function handlePointerDown(event: PointerEvent) {
-		if (!container) return;
+	function handleSearchKeyDown(event: KeyboardEvent) {
+		if (event.isComposing) return;
 
-		const target = event.target as Node | null;
+		if (event.key === 'ArrowDown') {
+			event.preventDefault();
+			if (compatibleIndices.length === 0) return;
 
-		if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
-			closeMenu();
-		}
-	}
-
-	function handleKeydown(event: KeyboardEvent) {
-		if (event.key === 'Escape') {
-			closeMenu();
-		}
-	}
-
-	function handleResize() {
-		if (isOpen) {
-			updateMenuPosition();
-		}
-	}
-
-	function updateMenuPosition() {
-		if (!isOpen || !triggerButton || !menuRef) return;
-
-		const triggerRect = triggerButton.getBoundingClientRect();
-		const viewportWidth = window.innerWidth;
-		const viewportHeight = window.innerHeight;
-
-		if (viewportWidth === 0 || viewportHeight === 0) return;
-
-		const scrollWidth = menuRef.scrollWidth;
-		const scrollHeight = menuRef.scrollHeight;
-
-		const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
-		const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
-		const safeMaxWidth =
-			constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
-		const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
-
-		let width = Math.min(
-			Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
-			safeMaxWidth || 320
-		);
-
-		const availableBelow = Math.max(
-			0,
-			viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
-		);
-		const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
-		const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
-		const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
-
-		function computePlacement(placement: 'top' | 'bottom') {
-			const available = placement === 'bottom' ? availableBelow : availableAbove;
-			const allowedHeight =
-				available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
-			const maxHeight = Math.min(scrollHeight, allowedHeight);
-			const height = Math.max(0, maxHeight);
-
-			let top: number;
-			if (placement === 'bottom') {
-				const rawTop = triggerRect.bottom + MENU_OFFSET;
-				const minTop = VIEWPORT_GUTTER;
-				const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-				if (maxTop < minTop) {
-					top = minTop;
-				} else {
-					top = Math.min(Math.max(rawTop, minTop), maxTop);
-				}
+			const currentPos = compatibleIndices.indexOf(highlightedIndex);
+			if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
+				highlightedIndex = compatibleIndices[0];
 			} else {
-				const rawTop = triggerRect.top - MENU_OFFSET - height;
-				const minTop = VIEWPORT_GUTTER;
-				const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-				if (maxTop < minTop) {
-					top = minTop;
-				} else {
-					top = Math.max(Math.min(rawTop, maxTop), minTop);
+				highlightedIndex = compatibleIndices[currentPos + 1];
+			}
+		} else if (event.key === 'ArrowUp') {
+			event.preventDefault();
+			if (compatibleIndices.length === 0) return;
+
+			const currentPos = compatibleIndices.indexOf(highlightedIndex);
+			if (currentPos === -1 || currentPos === 0) {
+				highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
+			} else {
+				highlightedIndex = compatibleIndices[currentPos - 1];
+			}
+		} else if (event.key === 'Enter') {
+			event.preventDefault();
+			if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
+				const option = filteredOptions[highlightedIndex];
+				if (isModelCompatible(option)) {
+					handleSelect(option.id);
 				}
-			}
-
-			return { placement, top, height, maxHeight };
-		}
-
-		const belowMetrics = computePlacement('bottom');
-		const aboveMetrics = computePlacement('top');
-
-		let metrics = belowMetrics;
-		if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
-			metrics = aboveMetrics;
-		}
-
-		let left = triggerRect.right - width;
-		const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
-		if (maxLeft < VIEWPORT_GUTTER) {
-			left = VIEWPORT_GUTTER;
-		} else {
-			if (left > maxLeft) {
-				left = maxLeft;
-			}
-			if (left < VIEWPORT_GUTTER) {
-				left = VIEWPORT_GUTTER;
+			} else if (compatibleIndices.length > 0) {
+				// No selection - highlight first compatible option
+				highlightedIndex = compatibleIndices[0];
 			}
 		}
-
-		menuPosition = {
-			top: Math.round(metrics.top),
-			left: Math.round(left),
-			width: Math.round(width),
-			placement: metrics.placement,
-			maxHeight: Math.round(metrics.maxHeight)
-		};
 	}
 
 	async function handleSelect(modelId: string) {
@@ -356,6 +293,14 @@
 
 		if (shouldCloseMenu) {
 			closeMenu();
+
+			// Focus the chat textarea after model selection
+			requestAnimationFrame(() => {
+				const textarea = document.querySelector<HTMLTextAreaElement>(
+					'[data-slot="chat-form"] textarea'
+				);
+				textarea?.focus();
+			});
 		}
 	}
 
@@ -404,10 +349,7 @@
 	}
 </script>
 
-<svelte:window onresize={handleResize} />
-<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
-
-<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
+<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
 	{#if loading && options.length === 0 && isRouter}
 		<div class="flex items-center gap-2 text-xs text-muted-foreground">
 			<Loader2 class="h-3.5 w-3.5 animate-spin" />
@@ -418,9 +360,8 @@
 	{:else}
 		{@const selectedOption = getDisplayOption()}
 
-		<div class="relative">
-			<button
-				type="button"
+		<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
+			<Popover.Trigger
 				class={cn(
 					`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
 					!isCurrentModelInCache()
@@ -430,15 +371,11 @@
 							: isHighlightedCurrentModelActive
 								? 'text-foreground'
 								: 'text-muted-foreground',
-					isOpen ? 'text-foreground' : '',
-					className
+					isOpen ? 'text-foreground' : ''
 				)}
 				style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
-				aria-haspopup={isRouter ? 'listbox' : undefined}
-				aria-expanded={isRouter ? isOpen : undefined}
-				onclick={toggleOpen}
-				bind:this={triggerButton}
-				disabled={disabled || updating}
+				onclick={handleTriggerClick}
+				disabled={disabled || updating || !isRouter}
 			>
 				<Package class="h-3.5 w-3.5" />
 
@@ -451,33 +388,35 @@
 				{:else if isRouter}
 					<ChevronDown class="h-3 w-3.5" />
 				{/if}
-			</button>
+			</Popover.Trigger>
 
-			{#if isOpen && isRouter}
-				<div
-					bind:this={menuRef}
-					use:portalToBody
-					class={cn(
-						'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
-						menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
-					)}
-					role="listbox"
-					style:top={menuPosition ? `${menuPosition.top}px` : undefined}
-					style:left={menuPosition ? `${menuPosition.left}px` : undefined}
-					style:width={menuPosition ? `${menuPosition.width}px` : undefined}
-					data-placement={menuPosition?.placement ?? 'bottom'}
-				>
+			<Popover.Content
+				class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
+				align="end"
+				sideOffset={8}
+				collisionPadding={16}
+			>
+				<div class="flex max-h-[50dvh] flex-col overflow-hidden">
 					<div
-						class="overflow-y-auto py-1"
-						style:max-height={menuPosition && menuPosition.maxHeight > 0
-							? `${menuPosition.maxHeight}px`
-							: undefined}
+						class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
+					>
+						<SearchInput
+							id="model-search"
+							placeholder="Search models..."
+							bind:value={searchTerm}
+							bind:ref={searchInputRef}
+							onClose={closeMenu}
+							onKeyDown={handleSearchKeyDown}
+						/>
+					</div>
+					<div
+						class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
 					>
 						{#if !isCurrentModelInCache() && currentModel}
 							<!-- Show unavailable model as first option (disabled) -->
 							<button
 								type="button"
-								class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
+								class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
 								role="option"
 								aria-selected="true"
 								aria-disabled="true"
@@ -488,20 +427,25 @@
 							</button>
 							<div class="my-1 h-px bg-border"></div>
 						{/if}
-						{#each options as option (option.id)}
+						{#if filteredOptions.length === 0}
+							<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
+						{/if}
+						{#each filteredOptions as option, index (option.id)}
 							{@const status = getModelStatus(option.model)}
 							{@const isLoaded = status === ServerModelStatus.LOADED}
 							{@const isLoading = status === ServerModelStatus.LOADING}
 							{@const isSelected = currentModel === option.model || activeId === option.id}
 							{@const isCompatible = isModelCompatible(option)}
+							{@const isHighlighted = index === highlightedIndex}
 							{@const missingModalities = getMissingModalities(option)}
+
 							<div
 								class={cn(
-									'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
+									'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
 									isCompatible
 										? 'cursor-pointer hover:bg-muted focus:bg-muted'
 										: 'cursor-not-allowed opacity-50',
-									isSelected
+									isSelected || isHighlighted
 										? 'bg-accent text-accent-foreground'
 										: isCompatible
 											? 'hover:bg-accent hover:text-accent-foreground'
@@ -509,10 +453,11 @@
 									isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
 								)}
 								role="option"
-								aria-selected={isSelected}
+								aria-selected={isSelected || isHighlighted}
 								aria-disabled={!isCompatible}
 								tabindex={isCompatible ? 0 : -1}
 								onclick={() => isCompatible && handleSelect(option.id)}
+								onmouseenter={() => (highlightedIndex = index)}
 								onkeydown={(e) => {
 									if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
 										e.preventDefault();
@@ -586,8 +531,8 @@
 						{/each}
 					</div>
 				</div>
-			{/if}
-		</div>
+			</Popover.Content>
+		</Popover.Root>
 	{/if}
 </div>
 
diff --git a/tools/server/webui/src/lib/components/ui/popover/index.ts b/tools/server/webui/src/lib/components/ui/popover/index.ts
new file mode 100644
index 000000000..c5937fb3a
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/index.ts
@@ -0,0 +1,19 @@
+import Root from './popover.svelte';
+import Close from './popover-close.svelte';
+import Content from './popover-content.svelte';
+import Trigger from './popover-trigger.svelte';
+import Portal from './popover-portal.svelte';
+
+export {
+	Root,
+	Content,
+	Trigger,
+	Close,
+	Portal,
+	//
+	Root as Popover,
+	Content as PopoverContent,
+	Trigger as PopoverTrigger,
+	Close as PopoverClose,
+	Portal as PopoverPortal
+};
diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte
new file mode 100644
index 000000000..dc4dec4b3
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
+</script>
+
+<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />
diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte
new file mode 100644
index 000000000..2d3513d34
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte
@@ -0,0 +1,37 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+	import PopoverPortal from './popover-portal.svelte';
+	import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
+	import type { ComponentProps } from 'svelte';
+
+	let {
+		ref = $bindable(null),
+		class: className,
+		sideOffset = 4,
+		side,
+		align = 'center',
+		collisionPadding = 8,
+		avoidCollisions = true,
+		portalProps,
+		...restProps
+	}: PopoverPrimitive.ContentProps & {
+		portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
+	} = $props();
+</script>
+
+<PopoverPortal {...portalProps}>
+	<PopoverPrimitive.Content
+		bind:ref
+		data-slot="popover-content"
+		{sideOffset}
+		{side}
+		{align}
+		{collisionPadding}
+		{avoidCollisions}
+		class={cn(
+			'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
+			className
+		)}
+		{...restProps}
+	/>
+</PopoverPortal>
diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte
new file mode 100644
index 000000000..25efb877b
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { ...restProps }: PopoverPrimitive.PortalProps = $props();
+</script>
+
+<PopoverPrimitive.Portal {...restProps} />
diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte
new file mode 100644
index 000000000..5ef3d0e93
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte
@@ -0,0 +1,17 @@
+<script lang="ts">
+	import { cn } from '$lib/components/ui/utils.js';
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let {
+		ref = $bindable(null),
+		class: className,
+		...restProps
+	}: PopoverPrimitive.TriggerProps = $props();
+</script>
+
+<PopoverPrimitive.Trigger
+	bind:ref
+	data-slot="popover-trigger"
+	class={cn('', className)}
+	{...restProps}
+/>
diff --git a/tools/server/webui/src/lib/components/ui/popover/popover.svelte b/tools/server/webui/src/lib/components/ui/popover/popover.svelte
new file mode 100644
index 000000000..f39b867a6
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/popover/popover.svelte
@@ -0,0 +1,7 @@
+<script lang="ts">
+	import { Popover as PopoverPrimitive } from 'bits-ui';
+
+	let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
+</script>
+
+<PopoverPrimitive.Root bind:open {...restProps} />
diff --git a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts
index c95d3f184..003fc77ac 100644
--- a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts
+++ b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts
@@ -1,3 +1,2 @@
 export const VIEWPORT_GUTTER = 8;
 export const MENU_OFFSET = 6;
-export const MENU_MAX_WIDTH = 320;
diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts
index 29416c2fe..34b26403e 100644
--- a/tools/server/webui/src/lib/stores/models.svelte.ts
+++ b/tools/server/webui/src/lib/stores/models.svelte.ts
@@ -295,14 +295,21 @@ class ModelsStore {
 	 * Fetch props for a specific model from /props endpoint
 	 * Uses caching to avoid redundant requests
 	 *
+	 * In ROUTER mode, props are only fetched when the model is loaded,
+	 * because the /props endpoint responds with 400 for unloaded models.
+	 *
 	 * @param modelId - Model identifier to fetch props for
-	 * @returns Props data or null if fetch failed
+	 * @returns Props data, or null if the fetch failed, the model is not loaded (ROUTER mode), or a fetch for it is already in progress
 	 */
 	async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
 		// Return cached props if available
 		const cached = this.modelPropsCache.get(modelId);
 		if (cached) return cached;
 
+		if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
+			return null;
+		}
+
 		// Avoid duplicate fetches
 		if (this.modelPropsFetching.has(modelId)) return null;
 
diff --git a/tools/server/webui/src/lib/utils/latex-protection.test.ts b/tools/server/webui/src/lib/utils/latex-protection.test.ts
index 2354f8fa0..40fe1b0db 100644
--- a/tools/server/webui/src/lib/utils/latex-protection.test.ts
+++ b/tools/server/webui/src/lib/utils/latex-protection.test.ts
@@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases}
 		expect(output).toBe(input); // Code blocks prevent misinterpretation
 	});
 
+	test('preserves backslash parentheses in code blocks (GitHub issue)', () => {
+		const input = '```python\nfoo = "\\(bar\\)"\n```';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
+	});
+
+	test('preserves backslash brackets in code blocks', () => {
+		const input = '```python\nfoo = "\\[bar\\]"\n```';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
+	});
+
+	test('preserves backslash parentheses in inline code', () => {
+		const input = 'Use `foo = "\\(bar\\)"` in your code.';
+		const output = preprocessLaTeX(input);
+
+		expect(output).toBe(input);
+	});
+
 	test('escape backslash in mchem ce', () => {
 		const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$';
 		const output = preprocessLaTeX(input);
diff --git a/tools/server/webui/src/lib/utils/latex-protection.ts b/tools/server/webui/src/lib/utils/latex-protection.ts
index 7f5cf2cdd..cafa2d476 100644
--- a/tools/server/webui/src/lib/utils/latex-protection.ts
+++ b/tools/server/webui/src/lib/utils/latex-protection.ts
@@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string {
 		return expr;
 	});
 
-	// Step 5: Restore code blocks
-	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
-		return codeBlocks[parseInt(index)];
-	});
-
-	// Step 6: Apply additional escaping functions (brackets and mhchem)
+	// Step 5: Apply additional escaping functions (brackets and mhchem)
+	// This must happen BEFORE restoring code blocks to avoid affecting code content
 	content = escapeBrackets(content);
 
 	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
 		content = escapeMhchem(content);
 	}
 
-	// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
+	// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
+	// This must happen BEFORE restoring code blocks to avoid affecting code content
 	content = content
 		// Using the look‑behind pattern `(?<!\\)` we skip matches
 		// that are preceded by a backslash, e.g.
@@ -248,12 +245,18 @@ export function preprocessLaTeX(content: string): string {
 			// Using the look‑behind pattern `(?<!\\)` we skip matches
 			// that are preceded by a backslash, e.g. `\\[4pt]`.
 			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
-			(_, prefix: string, content: string) => {
-				return `${prefix}$$${content}$$`;
+			(_, content: string) => {
+				return `$$${content}$$`;
 			}
 		);
 
-	// Step 7: Restore blockquote markers
+	// Step 7: Restore code blocks
+	// This happens AFTER all LaTeX conversions to preserve code content
+	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
+		return codeBlocks[parseInt(index)];
+	});
+
+	// Step 8: Restore blockquote markers
 	if (blockquoteMarkers.size > 0) {
 		const finalLines = content.split('\n');
 		const restoredLines = finalLines.map((line, index) => {
diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt
index 369502d7a..e90e8e2d1 100644
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -9,6 +9,10 @@ if (NOT MSVC)
 endif()
 
 target_link_libraries  (${TARGET} PRIVATE Threads::Threads)
+
+if (WIN32 AND NOT MSVC)
+    target_link_libraries(${TARGET} PUBLIC ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 target_compile_definitions(${TARGET} PRIVATE