diff --git a/common/arg.cpp b/common/arg.cpp
index f9434bf70..0c5e9b651 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -774,6 +774,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);
@@ -1141,7 +1146,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cache-ram", "-cram"}, "N",
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1149,7 +1154,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--kv-unified", "-kvu"},
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1417,7 +1422,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -2075,26 +2080,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cpu-moe", "-cmoe"},
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe", "-ncmoe"}, "N",
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2109,14 +2114,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2644,7 +2649,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--reranking", "--rerank"},
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -3115,7 +3120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft-max", "--draft", "--draft-n"}, "N",
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
diff --git a/common/preset.cpp b/common/preset.cpp
index 60746aad5..1aa9864d0 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"
 
 #include <fstream>
 #include <sstream>
@@ -15,9 +16,13 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
         args.push_back(opt.args.back()); // use the last arg as the main arg
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
@@ -63,6 +68,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }
 
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // set the option identified by `env`: if it is already stored in this preset, overwrite its value in place
+    for (auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) { // opt.env is null-checked first; options without one never match
+            val = value;
+            return;
+        }
+    }
+    // otherwise resolve `env` through ctx.key_to_opt and insert it; an unknown key throws std::runtime_error
+    if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[ctx.key_to_opt.at(env)] = value;
+}
+
+void common_preset::unset_option(const std::string & env) { // erase the stored option whose env name equals `env`; no-op when none matches
+    for (auto it = options.begin(); it != options.end(); ) {
+        const common_arg & opt = it->first;
+        if (opt.env && env == opt.env) { // opt.env is null-checked before the comparison
+            it = options.erase(it);
+            return; // stop after the first match
+        } else {
+            ++it;
+        }
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const { // look up a stored option by env name
+    for (const auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) { // opt.env is null-checked before the comparison
+            value = val; // copy the stored value into the out-parameter
+            return true;
+        }
+    }
+    return false; // not found: `value` is left unmodified
+}
+
+void common_preset::merge(const common_preset & other) { // copy all options of `other` into this preset; `name` is not modified
+    for (const auto & [opt, val] : other.options) {
+        options[opt] = val; // overwrite existing options
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
@@ -172,9 +223,12 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+common_preset_context::common_preset_context(llama_example ex) // builds the parser context for `ex`, then the key -> option lookup table
+    : ctx_params(common_params_parser_init(default_params, ex)), // uses the default_params member, which must be declared before ctx_params
+      key_to_opt(get_map_key_opt(ctx_params)) {} // derived from ctx_params, so it must be declared after it
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);
 
     for (auto section : ini_data) {
@@ -188,7 +242,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt[key];
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {
@@ -199,8 +253,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const { // one preset per entry returned by common_list_cached_models()
+    common_presets out;
+
+    auto cached_models = common_list_cached_models();
+    for (const auto & model : cached_models) {
+        common_preset preset;
+        preset.name = model.to_string(); // the model's string form is both the preset name and the option value below
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string()); // throws if key_to_opt has no "LLAMA_ARG_HF_REPO" entry
         out[preset.name] = preset;
     }
 
     return out;
 }
+
+struct local_model { // a model found while scanning a models directory (see load_from_models_dir)
+    std::string name;        // used as the preset name
+    std::string path;        // stored as the LLAMA_ARG_MODEL value
+    std::string path_mmproj; // stored as the LLAMA_ARG_MMPROJ value; empty when there is none
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const { // throws std::runtime_error if models_dir is missing or not a directory
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { // registers at most one model per subdirectory
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file; // when several plain .gguf files exist, the last one listed wins
+                }
+            }
+        }
+        // prefer the first shard of a split model; otherwise fall back to the plain .gguf file
+        local_model model{
+            /* name        */ name,
+            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model
+            std::string name = file.name;
+            name.erase(name.rfind(".gguf")); // strip only the trailing extension, not every ".gguf" inside the name
+            local_model model{
+                /* name        */ name,
+                /* path        */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const { // build one preset from a CLI argument vector
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options); // fills preset.options for the example type held in ctx_params
+    if (!ok) {
+        throw std::runtime_error("failed to parse CLI arguments into preset"); // a false return is surfaced as an exception
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const { // returns a new map; neither input is modified
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        if (out.find(name) != out.end()) {
+            // present in both: options from `added` overwrite those from `base`, the base preset's name is kept
+            common_preset & target = out[name];
+            target.merge(preset_added);
+        } else {
+            // only in `added`: copied over unchanged
+            out[name] = preset_added;
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const { // one output entry per key of `presets`
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset tmp = base; // copy
+        tmp.name = name; // the result is named after the map key, not after `base`
+        tmp.merge(preset); // the preset's options overwrite the ones inherited from `base`
+        out[name] = std::move(tmp);
+    }
+    return out; // empty when `presets` is empty; `base` alone is never emitted
+}
diff --git a/common/preset.h b/common/preset.h
index dceb849eb..3a84d1be2 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -13,20 +13,62 @@
 
 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
 
+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-    // TODO: support repeated args in the future
+
+    // options are stored as a common_arg -> string map: each entry is one CLI argument and its value
     std::map<common_arg, std::string> options;
 
     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const; // a non-empty bin_path becomes the first element of the list
 
     // convert preset to INI format string
     std::string to_ini() const;
 
     // TODO: maybe implement to_env() if needed
+
+    // set an option identified by its env variable; throws std::runtime_error if ctx has no entry for it
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable (no-op if it is not set)
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false (leaving value untouched) if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options (name is not changed)
+    void merge(const common_preset & other);
 };
 
 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params default_params; // passed to common_params_parser_init() when ctx_params is built; must stay declared before ctx_params
+    common_params_context ctx_params;
+    std::map<std::string, common_arg> key_to_opt; // built from ctx_params by the constructor; resolves INI keys and env names to options
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file; a section named "*" is stored into `global` instead of the returned map
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory; throws std::runtime_error if it is not an existing directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments; throws std::runtime_error if they cannot be parsed
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading); each result keeps the name of its entry in `presets`
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
diff --git a/docs/android/imported-into-android-studio.jpg b/docs/android/imported-into-android-studio.jpg
new file mode 100644
index 000000000..bbe6867c6
Binary files /dev/null and b/docs/android/imported-into-android-studio.jpg differ
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a948570e2..2f54905bb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1544,6 +1544,8 @@ private:
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 static bool vk_perf_logger_enabled = false;
+static bool vk_perf_logger_concurrent = false;
+static bool vk_enable_sync_logger = false;
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
 
@@ -1594,14 +1596,14 @@ class vk_perf_logger {
         flops.clear();
     }
 
-    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+    std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
+        *n_flops = 0;
         std::string fusion_str;
         if (fusion_name) {
             fusion_str = fusion_name + std::string(" ");
         }
         if (node->op == GGML_OP_UNARY) {
-            timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
-            return;
+            return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
             const uint64_t m     = node->ne[0];
@@ -1623,9 +1625,8 @@ class vk_perf_logger {
                 name += " batch=" + std::to_string(batch);
             }
             name = fusion_str + name;
-            timings[name].push_back(time);
-            flops[name].push_back(m * n * (k + (k - 1)) * batch);
-            return;
+            *n_flops = m * n * (k + (k - 1)) * batch;
+            return name;
         }
         if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
             std::string   name    = ggml_op_name(node->op);
@@ -1641,20 +1642,17 @@ class vk_perf_logger {
             uint64_t      size_M  = Cout;
             uint64_t      size_K  = Cin * KW * KH;
             uint64_t      size_N  = N * OW * OH;
-            uint64_t      n_flops = size_M * size_N * (size_K + (size_K - 1));
+            *n_flops = size_M * size_N * (size_K + (size_K - 1));
             name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
                     ", N=N*OW*OH=" + std::to_string(size_N);
             name = fusion_str + name;
-            flops[name].push_back(n_flops);
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_RMS_NORM) {
             std::string   name    = ggml_op_name(node->op);
             name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
             name = fusion_str + name;
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_FLASH_ATTN_EXT) {
             const ggml_tensor * dst = node;
@@ -1670,8 +1668,7 @@ class vk_perf_logger {
                 " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
                 " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
                 " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
         if (node->op == GGML_OP_TOP_K) {
             std::stringstream name;
@@ -1679,11 +1676,38 @@ class vk_perf_logger {
             name << ggml_op_name(node->op) <<
                 " K=" << node->ne[0] <<
                 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
-        timings[fusion_str + ggml_op_name(node->op)].push_back(time);
+        return fusion_str + ggml_op_name(node->op);
     }
+
+    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { // record one timing sample for a single node / fusion
+        uint64_t n_flops; // always assigned by get_node_fusion_name(), which zeroes it first
+        std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
+        if (n_flops) { // a flops sample is stored only when a nonzero count was reported
+            flops[name].push_back(n_flops);
+        }
+        timings[name].push_back(time);
+    }
+
+    void log_timing(const std::vector<ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) { // record one timing sample for a whole group of nodes
+        uint64_t total_flops = 0;
+        std::string name; // map key: the per-node names joined with ", "
+        for (size_t n = 0; n < nodes.size(); ++n) {
+            uint64_t n_flops = 0;
+            name += get_node_fusion_name(nodes[n], names[n], &n_flops); // names is indexed in step with nodes, so it needs at least nodes.size() entries
+            total_flops += n_flops;
+
+            if (n != nodes.size() - 1) { // separator between entries, none after the last
+                name += ", ";
+            }
+        }
+        if (total_flops) { // a flops sample is stored only when the group's summed count is nonzero
+            flops[name].push_back(total_flops);
+        }
+        timings[name].push_back(time); // an empty `nodes` records the sample under the empty-string key
+    }
+
   private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
@@ -1746,7 +1770,9 @@ struct ggml_backend_vk_context {
     std::unique_ptr<vk_perf_logger> perf_logger;
     vk::QueryPool query_pool;
     std::vector<const char *> query_fusion_names;
+    std::vector<int> query_fusion_node_count;
     std::vector<ggml_tensor *> query_nodes;
+    std::vector<int> query_node_idx;
     int32_t num_queries {};
     int32_t query_idx {};
 };
@@ -5225,6 +5251,8 @@ static void ggml_vk_instance_init() {
     }
 
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+    vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
+    vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
     const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
 
     if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
@@ -11857,15 +11885,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             }
         }
 
-#define ENABLE_SYNC_LOGGING 0
-
         if (need_sync) {
-#if ENABLE_SYNC_LOGGING
-            std::cerr <<  "sync" << std::endl;
-#endif
+            if (vk_enable_sync_logger) {
+                std::cerr <<  "sync" << std::endl;
+            }
             ctx->unsynced_nodes_written.clear();
             ctx->unsynced_nodes_read.clear();
             ggml_vk_sync_buffers(ctx, compute_ctx);
+
+            if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+                ctx->query_node_idx[ctx->query_idx] = node_idx;
+                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            }
         }
         // Add all fused nodes to the unsynchronized lists.
         for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
@@ -11882,20 +11913,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             }
         }
     }
-#if ENABLE_SYNC_LOGGING
-    for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
-        auto *n = cgraph->nodes[node_idx + i];
-        std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " <<  n->name;
-        if (n->op == GGML_OP_GLU) {
-            std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+    if (vk_enable_sync_logger) {
+        for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+            auto *n = cgraph->nodes[node_idx + i];
+            std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " <<  n->name;
+            if (n->op == GGML_OP_GLU) {
+                std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+            }
+            if (n->op == GGML_OP_ROPE) {
+                const int mode = ((const int32_t *) n->op_params)[2];
+                std::cerr << " rope mode: " << mode;
+            }
+            std::cerr << std::endl;
         }
-        if (n->op == GGML_OP_ROPE) {
-            const int mode = ((const int32_t *) n->op_params)[2];
-            std::cerr << " rope mode: " << mode;
-        }
-        std::cerr << std::endl;
     }
-#endif
 
     switch (node->op) {
     case GGML_OP_REPEAT:
@@ -13175,12 +13206,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
             ctx->num_queries = query_create_info.queryCount;
             ctx->query_fusion_names.resize(ctx->num_queries);
+            ctx->query_fusion_node_count.resize(ctx->num_queries);
             ctx->query_nodes.resize(ctx->num_queries);
+            ctx->query_node_idx.resize(ctx->num_queries);
         }
 
         ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
         std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
+        std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0);
         std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
+        std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
 
         GGML_ASSERT(ctx->compute_ctx.expired());
         compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
@@ -13309,9 +13344,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             } else {
                 compute_ctx = ctx->compute_ctx.lock();
             }
-            ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
-            ctx->query_fusion_names[ctx->query_idx] = fusion_string;
-            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            if (!vk_perf_logger_concurrent) {
+                // track a single node/fusion for the current query
+                ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
+                ctx->query_fusion_names[ctx->query_idx] = fusion_string;
+                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            } else {
+                // track a fusion string and number of fused ops for the current node_idx
+                ctx->query_fusion_names[i] = fusion_string;
+                ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops;
+            }
         }
 
         if (enqueued) {
@@ -13353,12 +13395,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         // Get the results and pass them to the logger
         std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
         VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
-        for (int i = 1; i < ctx->query_idx; i++) {
-            auto node = ctx->query_nodes[i];
-            auto name = ctx->query_fusion_names[i];
-            ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+        if (!vk_perf_logger_concurrent) {
+            // Log each op separately
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto node = ctx->query_nodes[i];
+                auto name = ctx->query_fusion_names[i];
+                ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        } else {
+            // Log each group of nodes
+            int prev_node_idx = 0;
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto cur_node_idx = ctx->query_node_idx[i];
+                std::vector<ggml_tensor *> nodes;
+                std::vector<const char *> names;
+                for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
+                    if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
+                        continue;
+                    }
+                    nodes.push_back(cgraph->nodes[node_idx]);
+                    names.push_back(ctx->query_fusion_names[node_idx]);
+                    node_idx += ctx->query_fusion_node_count[node_idx];
+                }
+                prev_node_idx = cur_node_idx;
+                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
         }
-
         ctx->perf_logger->print_timings();
     }
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 966e82d00..b64549f56 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1109,10 +1109,10 @@ bool llama_model_loader::load_all_data(
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
-                    auto offset = (off_t) weight->offs;
+                    size_t offset = weight->offs;
                     alignment = file->read_alignment();
-                    off_t aligned_offset = offset & ~(alignment - 1);
-                    off_t offset_from_alignment = offset - aligned_offset;
+                    size_t aligned_offset = offset & ~(alignment - 1);
+                    size_t offset_from_alignment = offset - aligned_offset;
                     file->seek(aligned_offset, SEEK_SET);
 
                     // Calculate aligned read boundaries
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 9e44f0326..b5266edee 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index def57d025..9228fba9f 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1974,19 +1974,33 @@ struct server_context_impl {
 
                         if (!slot.can_split()) {
                             if (slot.task->n_tokens() > n_ubatch) {
-                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
+                                send_error(slot,
+                                           string_format(
+                                               "input (%d tokens) is too large to process. increase the physical batch "
+                                               "size (current batch size: %d)",
+                                               slot.task->n_tokens(), n_ubatch),
+                                           ERROR_TYPE_SERVER);
                                 slot.release();
                                 continue;
                             }
 
                             if (slot.task->n_tokens() > slot.n_ctx) {
-                                send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                                send_error(
+                                    slot,
+                                    string_format(
+                                        "input (%d tokens) is larger than the max context size (%d tokens). skipping",
+                                        slot.task->n_tokens(), slot.n_ctx),
+                                    ERROR_TYPE_EXCEED_CONTEXT_SIZE);
                                 slot.release();
                                 continue;
                             }
                         } else {
                             if (slot.task->n_tokens() >= slot.n_ctx) {
-                                send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                                send_error(slot,
+                                           string_format("request (%d tokens) exceeds the available context size (%d "
+                                                         "tokens), try increasing it",
+                                                         slot.task->n_tokens(), slot.n_ctx),
+                                           ERROR_TYPE_EXCEED_CONTEXT_SIZE);
                                 slot.release();
                                 continue;
                             }
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index c1f86e549..db7ab667f 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -82,154 +82,30 @@ static std::filesystem::path get_server_exec_path() {
 #endif
 }
 
-struct local_model {
-    std::string name;
-    std::string path;
-    std::string path_mmproj;
-};
-
-static std::vector<local_model> list_local_models(const std::string & dir) {
-    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
-        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str()));
-    }
-
-    std::vector<local_model> models;
-    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
-        auto files = fs_list(subdir_path, false);
-        common_file_info model_file;
-        common_file_info first_shard_file;
-        common_file_info mmproj_file;
-        for (const auto & file : files) {
-            if (string_ends_with(file.name, ".gguf")) {
-                if (file.name.find("mmproj") != std::string::npos) {
-                    mmproj_file = file;
-                } else if (file.name.find("-00001-of-") != std::string::npos) {
-                    first_shard_file = file;
-                } else {
-                    model_file = file;
-                }
-            }
-        }
-        // single file model
-        local_model model{
-            /* name        */ name,
-            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
-            /* path_mmproj */ mmproj_file.path // can be empty
-        };
-        if (!model.path.empty()) {
-            models.push_back(model);
-        }
-    };
-
-    auto files = fs_list(dir, true);
-    for (const auto & file : files) {
-        if (file.is_dir) {
-            scan_subdir(file.path, file.name);
-        } else if (string_ends_with(file.name, ".gguf")) {
-            // single file model
-            std::string name = file.name;
-            string_replace_all(name, ".gguf", "");
-            local_model model{
-                /* name        */ name,
-                /* path        */ file.path,
-                /* path_mmproj */ ""
-            };
-            models.push_back(model);
-        }
-    }
-    return models;
-}
-
-//
-// server_presets
-//
-
-
-server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
-        : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
-    if (!presets_path.empty()) {
-        presets = common_presets_load(presets_path, ctx_params);
-        SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
-    }
-
-    // populate reserved args (will be appended by the router)
-    for (auto & opt : ctx_params.options) {
-        if (opt.env == nullptr) {
-            continue;
-        }
-        std::string env = opt.env;
-        if (env == "LLAMA_ARG_PORT" ||
-            env == "LLAMA_ARG_HOST" ||
-            env == "LLAMA_ARG_ALIAS" ||
-            env == "LLAMA_ARG_API_KEY" ||
-            env == "LLAMA_ARG_MODELS_DIR" ||
-            env == "LLAMA_ARG_MODELS_MAX" ||
-            env == "LLAMA_ARG_MODELS_PRESET" ||
-            env == "LLAMA_ARG_MODEL" ||
-            env == "LLAMA_ARG_MMPROJ" ||
-            env == "LLAMA_ARG_HF_REPO" ||
-            env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" ||
-            env == "LLAMA_ARG_SSL_KEY_FILE" ||
-            env == "LLAMA_ARG_SSL_CERT_FILE") {
-            control_args[env] = opt;
-        }
-    }
-
-    // read base args from router's argv
-    common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
-
-    // remove any router-controlled args from base_args
-    for (const auto & cargs : control_args) {
-        auto it = base_args.find(cargs.second);
-        if (it != base_args.end()) {
-            base_args.erase(it);
-        }
+static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
+    // reserved options that are always removed from the preset
+    for (const char * env : { "LLAMA_ARG_SSL_KEY_FILE", "LLAMA_ARG_SSL_CERT_FILE", "LLAMA_API_KEY",
+                              "LLAMA_ARG_MODELS_DIR", "LLAMA_ARG_MODELS_MAX", "LLAMA_ARG_MODELS_PRESET",
+                              "LLAMA_ARG_MODELS_AUTOLOAD" }) {
+        preset.unset_option(env);
+    }
+    // model-selection options are removed only when the caller asks for it
+    if (unset_model_args) {
+        for (const char * env : { "LLAMA_ARG_MODEL", "LLAMA_ARG_MMPROJ", "LLAMA_ARG_HF_REPO" }) {
+            preset.unset_option(env);
+        }
     }
 }
 
-common_preset server_presets::get_preset(const std::string & name) {
-    auto it = presets.find(name);
-    if (it != presets.end()) {
-        return it->second;
-    }
-    return common_preset();
-}
-
-void server_presets::render_args(server_model_meta & meta) {
-    common_preset preset = meta.preset; // copy
-    // merging 3 kinds of args:
-    // 1. model-specific args (from preset)
-    // force removing control args if any
-    for (auto & cargs : control_args) {
-        if (preset.options.find(cargs.second) != preset.options.end()) {
-            SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
-            preset.options.erase(cargs.second);
-        }
-    }
-    // 2. base args (from router)
-    // inherit from base args
-    for (const auto & [arg, value] : base_args) {
-        preset.options[arg] = value;
-    }
-    // 3. control args (from router)
-    // set control values
-    preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR;
-    preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
-    preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
-    if (meta.in_cache) {
-        preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name;
-    } else {
-        preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path;
-        if (!meta.path_mmproj.empty()) {
-            preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
-        }
-    }
-    // disable SSL for child processes (HTTPS already handled by router)
-    preset.options[control_args["LLAMA_ARG_SSL_KEY_FILE"]] = "";
-    preset.options[control_args["LLAMA_ARG_SSL_CERT_FILE"]] = "";
-    meta.args = preset.to_args();
-    // add back the binary path at the front
-    meta.args.insert(meta.args.begin(), get_server_exec_path().string());
+void server_model_meta::update_args(common_preset_context & ctx_preset, std::string bin_path) {
+    // refresh this model's preset with the router-assigned host/port/alias, then re-render `args` from it
+    unset_reserved_args(preset, false); // false: the MODEL / MMPROJ / HF_REPO options are left in place
+    preset.set_option(ctx_preset, "LLAMA_ARG_HOST",  CHILD_ADDR);
+    preset.set_option(ctx_preset, "LLAMA_ARG_PORT",  std::to_string(port));
+    preset.set_option(ctx_preset, "LLAMA_ARG_ALIAS", name);
+    // TODO: maybe validate the preset before rendering?
+    // render args; bin_path is handed to to_args() (presumably emitted as argv[0] - TODO confirm)
+    args = preset.to_args(bin_path);
 }
 
 //
@@ -240,20 +116,22 @@ server_models::server_models(
         const common_params & params,
         int argc,
         char ** argv,
-        char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
-    for (int i = 0; i < argc; i++) {
-        base_args.push_back(std::string(argv[i]));
-    }
+        char ** envp)
+            : ctx_preset(LLAMA_EXAMPLE_SERVER),
+              base_params(params),
+              base_preset(ctx_preset.load_from_args(argc, argv)) {
     for (char ** env = envp; *env != nullptr; env++) {
         base_env.push_back(std::string(*env));
     }
-    GGML_ASSERT(!base_args.empty());
+    // clean up base preset
+    unset_reserved_args(base_preset, true);
     // set binary path
     try {
-        base_args[0] = get_server_exec_path().string();
+        bin_path = get_server_exec_path().string();
     } catch (const std::exception & e) {
+        bin_path = argv[0];
         LOG_WRN("failed to get server executable path: %s\n", e.what());
-        LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
+        LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
     load_models();
 }
@@ -262,7 +140,7 @@ void server_models::add_model(server_model_meta && meta) {
     if (mapping.find(meta.name) != mapping.end()) {
         throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
     }
-    presets.render_args(meta); // populate meta.args
+    meta.update_args(ctx_preset, bin_path); // render args
     std::string name = meta.name;
     mapping[name] = instance_t{
         /* subproc */ std::make_shared<subprocess_s>(),
@@ -271,86 +149,62 @@ void server_models::add_model(server_model_meta && meta) {
     };
 }
 
-static std::vector<local_model> list_custom_path_models(server_presets & presets) {
-    // detect any custom-path models in presets
-    std::vector<local_model> custom_models;
-    for (auto & [model_name, preset] : presets.presets) {
-        local_model model;
-        model.name = model_name;
-        std::vector<common_arg> to_erase;
-        for (auto & [arg, value] : preset.options) {
-            std::string env(arg.env ? arg.env : "");
-            if (env == "LLAMA_ARG_MODEL") {
-                model.path = value;
-                to_erase.push_back(arg);
-            }
-            if (env == "LLAMA_ARG_MMPROJ") {
-                model.path_mmproj = value;
-                to_erase.push_back(arg);
-            }
-        }
-        for (auto & arg : to_erase) {
-            preset.options.erase(arg);
-        }
-        if (!model.name.empty() && !model.path.empty()) {
-            custom_models.push_back(model);
-        }
-    }
-    return custom_models;
-}
-
 // TODO: allow refreshing cached model list
 void server_models::load_models() {
     // loading models from 3 sources:
     // 1. cached models
-    auto cached_models = common_list_cached_models();
-    for (const auto & model : cached_models) {
-        server_model_meta meta{
-            /* preset      */ presets.get_preset(model.to_string()),
-            /* name        */ model.to_string(),
-            /* path        */ model.manifest_path,
-            /* path_mmproj */ "", // auto-detected when loading
-            /* in_cache    */ true,
-            /* port        */ 0,
-            /* status      */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used   */ 0,
-            /* args        */ std::vector<std::string>(),
-            /* exit_code   */ 0
-        };
-        add_model(std::move(meta));
-    }
-    // 2. local models specificed via --models-dir
+    common_presets cached_models = ctx_preset.load_from_cache();
+    SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
+    // 2. local models from --models-dir
+    common_presets local_models;
     if (!base_params.models_dir.empty()) {
-        auto local_models = list_local_models(base_params.models_dir);
-        for (const auto & model : local_models) {
-            if (mapping.find(model.name) != mapping.end()) {
-                // already exists in cached models, skip
-                continue;
-            }
-            server_model_meta meta{
-                /* preset      */ presets.get_preset(model.name),
-                /* name        */ model.name,
-                /* path        */ model.path,
-                /* path_mmproj */ model.path_mmproj,
-                /* in_cache    */ false,
-                /* port        */ 0,
-                /* status      */ SERVER_MODEL_STATUS_UNLOADED,
-                /* last_used   */ 0,
-                /* args        */ std::vector<std::string>(),
-                /* exit_code   */ 0
-            };
-            add_model(std::move(meta));
+        local_models = ctx_preset.load_from_models_dir(base_params.models_dir);
+        SRV_INF("Loaded %zu local model presets from %s\n", local_models.size(), base_params.models_dir.c_str());
+    }
+    // 3. custom-path models from presets
+    common_preset global = {};
+    common_presets custom_presets = {};
+    if (!base_params.models_preset.empty()) {
+        custom_presets = ctx_preset.load_from_ini(base_params.models_preset, global);
+        SRV_INF("Loaded %zu custom model presets from %s\n", custom_presets.size(), base_params.models_preset.c_str());
+    }
+
+    // cascade, apply global preset first
+    cached_models  = ctx_preset.cascade(global, cached_models);
+    local_models   = ctx_preset.cascade(global, local_models);
+    custom_presets = ctx_preset.cascade(global, custom_presets);
+
+    // note: if a model exists in both cached and local, local takes precedence
+    common_presets final_presets;
+    for (const auto & [name, preset] : cached_models) {
+        final_presets[name] = preset;
+    }
+    for (const auto & [name, preset] : local_models) {
+        final_presets[name] = preset;
+    }
+
+    // process custom presets from INI
+    for (const auto & [name, custom] : custom_presets) {
+        if (final_presets.find(name) != final_presets.end()) {
+            // apply custom config if exists
+            common_preset & target = final_presets[name];
+            target.merge(custom);
+        } else {
+            // otherwise add directly
+            final_presets[name] = custom;
         }
     }
-    // 3. custom-path models specified in presets
-    auto custom_models = list_custom_path_models(presets);
-    for (const auto & model : custom_models) {
+
+    // the server's base preset (from CLI args) takes highest precedence
+    for (auto & [name, preset] : final_presets) {
+        preset.merge(base_preset);
+    }
+
+    // convert presets to server_model_meta and add to mapping
+    for (const auto & preset : final_presets) {
         server_model_meta meta{
-            /* preset      */ presets.get_preset(model.name),
-            /* name        */ model.name,
-            /* path        */ model.path,
-            /* path_mmproj */ model.path_mmproj,
-            /* in_cache    */ false,
+            /* preset      */ preset.second,
+            /* name        */ preset.first,
             /* port        */ 0,
             /* status      */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used   */ 0,
@@ -359,10 +213,18 @@ void server_models::load_models() {
         };
         add_model(std::move(meta));
     }
+
     // log available models
-    SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
-    for (const auto & [name, inst] : mapping) {
-        SRV_INF("  %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
+    {
+        std::unordered_set<std::string> custom_names;
+        for (const auto & [name, preset] : custom_presets) {
+            custom_names.insert(name);
+        }
+        SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
+        for (const auto & [name, inst] : mapping) {
+            bool has_custom = custom_names.find(name) != custom_names.end();
+            SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
+        }
     }
 }
 
@@ -526,7 +388,7 @@ void server_models::load(const std::string & name) {
     {
         SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
 
-        presets.render_args(inst.meta); // update meta.args
+        inst.meta.update_args(ctx_preset, bin_path); // render args
 
         std::vector<std::string> child_args = inst.meta.args; // copy
         std::vector<std::string> child_env  = base_env; // copy
@@ -877,7 +739,12 @@ void server_models_routes::init_routes() {
                 {"args",   meta.args},
             };
             if (!meta.preset.name.empty()) {
-                status["preset"] = meta.preset.to_ini();
+                common_preset preset_copy = meta.preset;
+                unset_reserved_args(preset_copy, false);
+                preset_copy.unset_option("LLAMA_ARG_HOST");
+                preset_copy.unset_option("LLAMA_ARG_PORT");
+                preset_copy.unset_option("LLAMA_ARG_ALIAS");
+                status["preset"] = preset_copy.to_ini();
             }
             if (meta.is_failed()) {
                 status["exit_code"] = meta.exit_code;
@@ -888,8 +755,6 @@ void server_models_routes::init_routes() {
                 {"object",   "model"},    // for OAI-compat
                 {"owned_by", "llamacpp"}, // for OAI-compat
                 {"created",  t},          // for OAI-compat
-                {"in_cache", meta.in_cache},
-                {"path",     meta.path},
                 {"status",   status},
                 // TODO: add other fields, may require reading GGUF metadata
             });
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index cbc4c4324..56fb398e3 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -51,9 +51,6 @@ static std::string server_model_status_to_string(server_model_status status) {
 struct server_model_meta {
     common_preset preset;
     std::string name;
-    std::string path;
-    std::string path_mmproj; // only available if in_cache=false
-    bool in_cache = false; // if true, use -hf; use -m otherwise
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
@@ -67,19 +64,8 @@ struct server_model_meta {
     bool is_failed() const {
         return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
     }
-};
 
-// the server_presets struct holds the presets read from presets.ini
-// as well as base args from the router server
-struct server_presets {
-    common_presets presets;
-    common_params_context ctx_params;
-    std::map<common_arg, std::string> base_args;
-    std::map<std::string, common_arg> control_args; // args reserved for server control
-
-    server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
-    common_preset get_preset(const std::string & name);
-    void render_args(server_model_meta & meta);
+    void update_args(common_preset_context & ctx_presets, std::string bin_path);
 };
 
 struct subprocess_s;
@@ -97,11 +83,12 @@ private:
     std::condition_variable cv;
     std::map<std::string, instance_t> mapping;
 
-    common_params base_params;
-    std::vector<std::string> base_args;
-    std::vector<std::string> base_env;
+    common_preset_context ctx_preset;
 
-    server_presets presets;
+    common_params base_params;
+    std::string bin_path;
+    std::vector<std::string> base_env;
+    common_preset base_preset; // base preset from llama-server CLI args
 
     void update_meta(const std::string & name, const server_model_meta & meta);
 
diff --git a/tools/server/webui/docs/architecture/high-level-architecture-simplified.md b/tools/server/webui/docs/architecture/high-level-architecture-simplified.md
index 50f2e1df0..a6cb1e9c3 100644
--- a/tools/server/webui/docs/architecture/high-level-architecture-simplified.md
+++ b/tools/server/webui/docs/architecture/high-level-architecture-simplified.md
@@ -11,6 +11,8 @@ flowchart TB
         C_Screen["ChatScreen"]
         C_Form["ChatForm"]
         C_Messages["ChatMessages"]
+        C_Message["ChatMessage"]
+        C_MessageEditForm["ChatMessageEditForm"]
         C_ModelsSelector["ModelsSelector"]
         C_Settings["ChatSettings"]
     end
@@ -54,7 +56,9 @@ flowchart TB
 
     %% Component hierarchy
     C_Screen --> C_Form & C_Messages & C_Settings
-    C_Form & C_Messages --> C_ModelsSelector
+    C_Messages --> C_Message
+    C_Message --> C_MessageEditForm
+    C_Form & C_MessageEditForm --> C_ModelsSelector
 
     %% Components → Hooks → Stores
     C_Form & C_Messages --> H1 & H2
@@ -93,7 +97,7 @@ flowchart TB
     classDef apiStyle fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
 
     class R1,R2,RL routeStyle
-    class C_Sidebar,C_Screen,C_Form,C_Messages,C_ModelsSelector,C_Settings componentStyle
+    class C_Sidebar,C_Screen,C_Form,C_Messages,C_Message,C_MessageEditForm,C_ModelsSelector,C_Settings componentStyle
     class H1,H2 hookStyle
     class S1,S2,S3,S4,S5 storeStyle
     class SV1,SV2,SV3,SV4,SV5 serviceStyle
diff --git a/tools/server/webui/docs/architecture/high-level-architecture.md b/tools/server/webui/docs/architecture/high-level-architecture.md
index 730da10a5..c5ec4d690 100644
--- a/tools/server/webui/docs/architecture/high-level-architecture.md
+++ b/tools/server/webui/docs/architecture/high-level-architecture.md
@@ -16,6 +16,8 @@ end
             C_Form["ChatForm"]
             C_Messages["ChatMessages"]
             C_Message["ChatMessage"]
+            C_MessageUser["ChatMessageUser"]
+            C_MessageEditForm["ChatMessageEditForm"]
             C_Attach["ChatAttachments"]
             C_ModelsSelector["ModelsSelector"]
             C_Settings["ChatSettings"]
@@ -38,7 +40,7 @@ end
             S1Error["<b>Error Handling:</b><br/>showErrorDialog()<br/>dismissErrorDialog()<br/>isAbortError()"]
             S1Msg["<b>Message Operations:</b><br/>addMessage()<br/>sendMessage()<br/>updateMessage()<br/>deleteMessage()<br/>getDeletionInfo()"]
             S1Regen["<b>Regeneration:</b><br/>regenerateMessage()<br/>regenerateMessageWithBranching()<br/>continueAssistantMessage()"]
-            S1Edit["<b>Editing:</b><br/>editAssistantMessage()<br/>editUserMessagePreserveResponses()<br/>editMessageWithBranching()"]
+            S1Edit["<b>Editing:</b><br/>editAssistantMessage()<br/>editUserMessagePreserveResponses()<br/>editMessageWithBranching()<br/>clearEditMode()<br/>isEditModeActive()<br/>getAddFilesHandler()<br/>setEditModeActive()"]
             S1Utils["<b>Utilities:</b><br/>getApiOptions()<br/>parseTimingData()<br/>getOrCreateAbortController()<br/>getConversationModel()"]
         end
         subgraph S2["conversationsStore"]
@@ -88,6 +90,10 @@ end
                 RE7["getChatStreaming()"]
                 RE8["getAllLoadingChats()"]
                 RE9["getAllStreamingChats()"]
+                RE9a["isEditModeActive()"]
+                RE9b["getAddFilesHandler()"]
+                RE9c["setEditModeActive()"]
+                RE9d["clearEditMode()"]
             end
             subgraph ConvExports["conversationsStore"]
                 RE10["conversations()"]
@@ -182,7 +188,10 @@ end
     %% Component hierarchy
     C_Screen --> C_Form & C_Messages & C_Settings
     C_Messages --> C_Message
-    C_Message --> C_ModelsSelector
+    C_Message --> C_MessageUser
+    C_MessageUser --> C_MessageEditForm
+    C_MessageEditForm --> C_ModelsSelector
+    C_MessageEditForm --> C_Attach
     C_Form --> C_ModelsSelector
     C_Form --> C_Attach
     C_Message --> C_Attach
@@ -190,6 +199,7 @@ end
     %% Components use Hooks
     C_Form --> H1
     C_Message --> H1 & H2
+    C_MessageEditForm --> H1
     C_Screen --> H2
 
     %% Hooks use Stores
@@ -244,7 +254,7 @@ end
     classDef apiStyle fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
 
     class R1,R2,RL routeStyle
-    class C_Sidebar,C_Screen,C_Form,C_Messages,C_Message componentStyle
+    class C_Sidebar,C_Screen,C_Form,C_Messages,C_Message,C_MessageUser,C_MessageEditForm componentStyle
     class C_ModelsSelector,C_Settings componentStyle
     class C_Attach componentStyle
     class H1,H2,H3 methodStyle
diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json
index 0d1a03aca..6fa9d39c7 100644
--- a/tools/server/webui/package-lock.json
+++ b/tools/server/webui/package-lock.json
@@ -25,7 +25,7 @@
 				"@chromatic-com/storybook": "^4.1.2",
 				"@eslint/compat": "^1.2.5",
 				"@eslint/js": "^9.18.0",
-				"@internationalized/date": "^3.8.2",
+				"@internationalized/date": "^3.10.1",
 				"@lucide/svelte": "^0.515.0",
 				"@playwright/test": "^1.49.1",
 				"@storybook/addon-a11y": "^10.0.7",
@@ -862,9 +862,9 @@
 			}
 		},
 		"node_modules/@internationalized/date": {
-			"version": "3.8.2",
-			"resolved": "https://registry.npmjs.org/@internationalized/date/-/date-3.8.2.tgz",
-			"integrity": "sha512-/wENk7CbvLbkUvX1tu0mwq49CVkkWpkXubGel6birjRPyo6uQ4nQpnq5xZu823zRCwwn82zgHrvgF1vZyvmVgA==",
+			"version": "3.10.1",
+			"resolved": "https://registry.npmjs.org/@internationalized/date/-/date-3.10.1.tgz",
+			"integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==",
 			"dev": true,
 			"license": "Apache-2.0",
 			"dependencies": {
diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json
index 1c970ae7a..1a8c27374 100644
--- a/tools/server/webui/package.json
+++ b/tools/server/webui/package.json
@@ -26,7 +26,7 @@
 		"@chromatic-com/storybook": "^4.1.2",
 		"@eslint/compat": "^1.2.5",
 		"@eslint/js": "^9.18.0",
-		"@internationalized/date": "^3.8.2",
+		"@internationalized/date": "^3.10.1",
 		"@lucide/svelte": "^0.515.0",
 		"@playwright/test": "^1.49.1",
 		"@storybook/addon-a11y": "^10.0.7",
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
index 3ad14ed3a..fd2f7f60e 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -8,6 +8,7 @@
 		ChatFormTextarea
 	} from '$lib/components/app';
 	import { INPUT_CLASSES } from '$lib/constants/input-classes';
+	import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
 	import { config } from '$lib/stores/settings.svelte';
 	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
 	import { isRouterMode } from '$lib/stores/server.svelte';
@@ -66,7 +67,7 @@
 	let message = $state('');
 	let pasteLongTextToFileLength = $derived.by(() => {
 		const n = Number(currentConfig.pasteLongTextToFileLen);
-		return Number.isNaN(n) ? 2500 : n;
+		return Number.isNaN(n) ? Number(SETTING_CONFIG_DEFAULT.pasteLongTextToFileLen) : n;
 	});
 	let previousIsLoading = $state(isLoading);
 	let recordingSupported = $state(false);
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index 0969a937e..220276fc9 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
@@ -12,13 +12,21 @@
 		onCopy?: (message: DatabaseMessage) => void;
 		onContinueAssistantMessage?: (message: DatabaseMessage) => void;
 		onDelete?: (message: DatabaseMessage) => void;
-		onEditWithBranching?: (message: DatabaseMessage, newContent: string) => void;
+		onEditWithBranching?: (
+			message: DatabaseMessage,
+			newContent: string,
+			newExtras?: DatabaseMessageExtra[]
+		) => void;
 		onEditWithReplacement?: (
 			message: DatabaseMessage,
 			newContent: string,
 			shouldBranch: boolean
 		) => void;
-		onEditUserMessagePreserveResponses?: (message: DatabaseMessage, newContent: string) => void;
+		onEditUserMessagePreserveResponses?: (
+			message: DatabaseMessage,
+			newContent: string,
+			newExtras?: DatabaseMessageExtra[]
+		) => void;
 		onNavigateToSibling?: (siblingId: string) => void;
 		onRegenerateWithBranching?: (message: DatabaseMessage, modelOverride?: string) => void;
 		siblingInfo?: ChatMessageSiblingInfo | null;
@@ -45,6 +53,8 @@
 		messageTypes: string[];
 	} | null>(null);
 	let editedContent = $state(message.content);
+	let editedExtras = $state<DatabaseMessageExtra[]>(message.extra ? [...message.extra] : []);
+	let editedUploadedFiles = $state<ChatUploadedFile[]>([]);
 	let isEditing = $state(false);
 	let showDeleteDialog = $state(false);
 	let shouldBranchAfterEdit = $state(false);
@@ -85,6 +95,16 @@
 	function handleCancelEdit() {
 		isEditing = false;
 		editedContent = message.content;
+		editedExtras = message.extra ? [...message.extra] : [];
+		editedUploadedFiles = [];
+	}
+
+	function handleEditedExtrasChange(extras: DatabaseMessageExtra[]) {
+		editedExtras = extras; // passed down as onEditedExtrasChange; replaces the working copy of message.extra
+	}
+
+	function handleEditedUploadedFilesChange(files: ChatUploadedFile[]) {
+		editedUploadedFiles = files; // passed down as onEditedUploadedFilesChange; these are parsed into extras on save
 	}
 
 	async function handleCopy() {
@@ -107,6 +127,8 @@
 	function handleEdit() {
 		isEditing = true;
 		editedContent = message.content;
+		editedExtras = message.extra ? [...message.extra] : [];
+		editedUploadedFiles = [];
 
 		setTimeout(() => {
 			if (textareaElement) {
@@ -143,9 +165,10 @@
 		onContinueAssistantMessage?.(message);
 	}
 
-	function handleSaveEdit() {
+	async function handleSaveEdit() {
 		if (message.role === 'user' || message.role === 'system') {
-			onEditWithBranching?.(message, editedContent.trim());
+			const finalExtras = await getMergedExtras();
+			onEditWithBranching?.(message, editedContent.trim(), finalExtras);
 		} else {
 			// For assistant messages, preserve exact content including trailing whitespace
 			// This is important for the Continue feature to work properly
@@ -154,15 +177,30 @@
 
 		isEditing = false;
 		shouldBranchAfterEdit = false;
+		editedUploadedFiles = [];
 	}
 
-	function handleSaveEditOnly() {
+	async function handleSaveEditOnly() {
 		if (message.role === 'user') {
 			// For user messages, trim to avoid accidental whitespace
-			onEditUserMessagePreserveResponses?.(message, editedContent.trim());
+			const finalExtras = await getMergedExtras();
+			onEditUserMessagePreserveResponses?.(message, editedContent.trim(), finalExtras);
 		}
 
 		isEditing = false;
+		editedUploadedFiles = [];
+	}
+
+	async function getMergedExtras(): Promise<DatabaseMessageExtra[]> {
+		// No files were uploaded during this edit: the kept extras are the final list.
+		if (editedUploadedFiles.length === 0) return editedExtras;
+
+		// Parse the uploaded files into extras and append them after the kept ones.
+		const { parseFilesToMessageExtras } = await import('$lib/utils/browser-only');
+		const parsed = await parseFilesToMessageExtras(editedUploadedFiles);
+		const uploadedExtras = parsed?.extras || [];
+
+		return [...editedExtras, ...uploadedExtras];
 	}
 
 	function handleShowDeleteDialogChange(show: boolean) {
@@ -197,6 +235,8 @@
 		class={className}
 		{deletionInfo}
 		{editedContent}
+		{editedExtras}
+		{editedUploadedFiles}
 		{isEditing}
 		{message}
 		onCancelEdit={handleCancelEdit}
@@ -206,6 +246,8 @@
 		onEdit={handleEdit}
 		onEditKeydown={handleEditKeydown}
 		onEditedContentChange={handleEditedContentChange}
+		onEditedExtrasChange={handleEditedExtrasChange}
+		onEditedUploadedFilesChange={handleEditedUploadedFilesChange}
 		{onNavigateToSibling}
 		onSaveEdit={handleSaveEdit}
 		onSaveEditOnly={handleSaveEditOnly}
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte
new file mode 100644
index 000000000..f812ea2fd
--- /dev/null
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte
@@ -0,0 +1,391 @@
+<script lang="ts">
+	import { X, ArrowUp, Paperclip, AlertTriangle } from '@lucide/svelte';
+	import { Button } from '$lib/components/ui/button';
+	import { Switch } from '$lib/components/ui/switch';
+	import { ChatAttachmentsList, DialogConfirmation, ModelsSelector } from '$lib/components/app';
+	import { INPUT_CLASSES } from '$lib/constants/input-classes';
+	import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
+	import { AttachmentType, FileTypeCategory, MimeTypeText } from '$lib/enums';
+	import { config } from '$lib/stores/settings.svelte';
+	import { useModelChangeValidation } from '$lib/hooks/use-model-change-validation.svelte';
+	import { setEditModeActive, clearEditMode } from '$lib/stores/chat.svelte';
+	import { conversationsStore } from '$lib/stores/conversations.svelte';
+	import { modelsStore } from '$lib/stores/models.svelte';
+	import { isRouterMode } from '$lib/stores/server.svelte';
+	import {
+		autoResizeTextarea,
+		getFileTypeCategory,
+		getFileTypeCategoryByExtension,
+		parseClipboardContent
+	} from '$lib/utils';
+
+	interface Props {
+		messageId: string;
+		editedContent: string;
+		editedExtras?: DatabaseMessageExtra[];
+		editedUploadedFiles?: ChatUploadedFile[];
+		originalContent: string;
+		originalExtras?: DatabaseMessageExtra[];
+		showSaveOnlyOption?: boolean;
+		onCancelEdit: () => void;
+		onSaveEdit: () => void;
+		onSaveEditOnly?: () => void;
+		onEditKeydown: (event: KeyboardEvent) => void;
+		onEditedContentChange: (content: string) => void;
+		onEditedExtrasChange?: (extras: DatabaseMessageExtra[]) => void;
+		onEditedUploadedFilesChange?: (files: ChatUploadedFile[]) => void;
+		textareaElement?: HTMLTextAreaElement;
+	}
+
+	let {
+		messageId,
+		editedContent,
+		editedExtras = [],
+		editedUploadedFiles = [],
+		originalContent,
+		originalExtras = [],
+		showSaveOnlyOption = false,
+		onCancelEdit,
+		onSaveEdit,
+		onSaveEditOnly,
+		onEditKeydown,
+		onEditedContentChange,
+		onEditedExtrasChange,
+		onEditedUploadedFilesChange,
+		textareaElement = $bindable()
+	}: Props = $props();
+
+	let fileInputElement: HTMLInputElement | undefined = $state();
+	let saveWithoutRegenerate = $state(false);
+	let showDiscardDialog = $state(false);
+	let isRouter = $derived(isRouterMode());
+	let currentConfig = $derived(config());
+
+	let pasteLongTextToFileLength = $derived.by(() => {
+		const n = Number(currentConfig.pasteLongTextToFileLen);
+
+		return Number.isNaN(n) ? Number(SETTING_CONFIG_DEFAULT.pasteLongTextToFileLen) : n;
+	});
+
+	// True when the draft differs from the original message: edited text, files
+	// uploaded during this edit, or an extras list whose length or entries
+	// (compared by identity, position by position) no longer match.
+	let hasUnsavedChanges = $derived.by(() => {
+		if (editedContent !== originalContent || editedUploadedFiles.length > 0) return true;
+
+		const sameLength = editedExtras.length === originalExtras.length;
+		const sameEntries =
+			sameLength && editedExtras.every((extra, i) => extra === originalExtras[i]);
+
+		return !sameEntries;
+	});
+
+	let hasAttachments = $derived(
+		(editedExtras && editedExtras.length > 0) ||
+			(editedUploadedFiles && editedUploadedFiles.length > 0)
+	);
+
+	let canSubmit = $derived(editedContent.trim().length > 0 || hasAttachments);
+
+	/**
+	 * Work out which modalities the attachments in the edit form call for.
+	 *
+	 * Kept extras contribute vision for images and for PDFs flagged as
+	 * `processedAsImages`, and audio for audio attachments. Files uploaded during
+	 * this edit are classified from `file.type` first, falling back to `file.name`.
+	 *
+	 * @returns flags telling whether vision and/or audio input is present
+	 */
+	function getEditedAttachmentsModalities(): ModelModalities {
+		// Predicates over the extras that are still attached to the message.
+		const extraNeedsVision = (extra: DatabaseMessageExtra) =>
+			extra.type === AttachmentType.IMAGE ||
+			(extra.type === AttachmentType.PDF &&
+				'processedAsImages' in extra &&
+				Boolean(extra.processedAsImages));
+
+		const extraNeedsAudio = (extra: DatabaseMessageExtra) =>
+			extra.type === AttachmentType.AUDIO;
+
+		// Category of every file uploaded during this edit.
+		const uploadCategories = editedUploadedFiles.map(
+			(file) => getFileTypeCategory(file.type) || getFileTypeCategoryByExtension(file.name)
+		);
+
+		const hasUploadOf = (wanted: FileTypeCategory) =>
+			uploadCategories.some((category) => category === wanted);
+
+		return {
+			vision: editedExtras.some(extraNeedsVision) || hasUploadOf(FileTypeCategory.IMAGE),
+			audio: editedExtras.some(extraNeedsAudio) || hasUploadOf(FileTypeCategory.AUDIO)
+		};
+	}
+
+	// Union of the modalities reported by conversationsStore.getModalitiesUpToMessage
+	// for this message and the modalities of the attachments in the edit form.
+	function getRequiredModalities(): ModelModalities {
+		const { vision: visionBefore, audio: audioBefore } =
+			conversationsStore.getModalitiesUpToMessage(messageId);
+		const { vision: visionEdited, audio: audioEdited } = getEditedAttachmentsModalities();
+
+		return { vision: visionBefore || visionEdited, audio: audioBefore || audioEdited };
+	}
+
+	const { handleModelChange } = useModelChangeValidation({
+		getRequiredModalities,
+		onValidationFailure: async (previousModelId) => {
+			if (previousModelId) {
+				await modelsStore.selectModelById(previousModelId);
+			}
+		}
+	});
+
+	// Pass files chosen in the file input to processNewFiles, then clear the input.
+	function handleFileInputChange(event: Event) {
+		const input = event.target as HTMLInputElement;
+		const picked = input.files ? Array.from(input.files) : [];
+		if (picked.length === 0) return;
+
+		processNewFiles(picked);
+		input.value = '';
+	}
+
+	function handleGlobalKeydown(event: KeyboardEvent) {
+		// Leave Escape alone when it was already handled or the discard dialog is open.
+		if (event.key !== 'Escape' || event.defaultPrevented || showDiscardDialog) return;
+		event.preventDefault();
+		attemptCancel();
+	}
+
+	function attemptCancel() {
+		// Nothing to lose: leave edit mode right away.
+		if (!hasUnsavedChanges) return onCancelEdit();
+
+		// Otherwise ask for confirmation before the edits are dropped.
+		showDiscardDialog = true;
+	}
+
+	function handleRemoveExistingAttachment(index: number) {
+		// Without a listener there is nobody to hand the updated list to.
+		if (!onEditedExtrasChange) return;
+		// Splice a copy so the list received through props is left as it is.
+		const remaining = Array.from(editedExtras);
+		remaining.splice(index, 1);
+		onEditedExtrasChange(remaining);
+	}
+
+	// Report the uploaded files without the one whose id equals `fileId`.
+	function handleRemoveUploadedFile(fileId: string) {
+		const isKept = (file: ChatUploadedFile) => file.id !== fileId;
+
+		// Optional call: without a listener nothing is filtered or reported.
+		onEditedUploadedFilesChange?.(editedUploadedFiles.filter(isKept));
+	}
+
+	// Submit the edit when it is submittable, then switch "save without
+	// regenerate" back off.
+	function handleSubmit() {
+		if (!canSubmit) return;
+
+		// Use the save-only callback when the switch is on and the callback exists.
+		const submit = (saveWithoutRegenerate && onSaveEditOnly) || onSaveEdit;
+		submit();
+
+		saveWithoutRegenerate = false;
+	}
+
+	async function processNewFiles(files: File[]) {
+		// Without a listener or without files there is nothing to process or report.
+		if (!onEditedUploadedFilesChange || files.length === 0) return;
+
+		const { processFilesToChatUploaded } = await import('$lib/utils/browser-only');
+		const processed = await processFilesToChatUploaded(files);
+		onEditedUploadedFilesChange([...editedUploadedFiles, ...processed]);
+	}
+
+	function handlePaste(event: ClipboardEvent) {
+		if (!event.clipboardData) return;
+		// 1) Files on the clipboard are added as uploads instead of being pasted.
+		const files = Array.from(event.clipboardData.items)
+			.filter((item) => item.kind === 'file')
+			.map((item) => item.getAsFile())
+			.filter((file): file is File => file !== null);
+
+		if (files.length > 0) {
+			event.preventDefault();
+			processNewFiles(files);
+
+			return;
+		}
+		// 2) No files: inspect the plain-text payload instead.
+		const text = event.clipboardData.getData(MimeTypeText.PLAIN);
+		// Text starting with '"' is parsed; any text attachments found become files.
+		if (text.startsWith('"')) {
+			const parsed = parseClipboardContent(text);
+
+			if (parsed.textAttachments.length > 0) {
+				event.preventDefault();
+				onEditedContentChange(parsed.message);
+
+				const attachmentFiles = parsed.textAttachments.map(
+					(att) =>
+						new File([att.content], att.name, {
+							type: MimeTypeText.PLAIN
+						})
+				);
+
+				processNewFiles(attachmentFiles);
+				// Put focus back on the textarea shortly afterwards.
+				setTimeout(() => {
+					textareaElement?.focus();
+				}, 10);
+
+				return;
+			}
+		}
+		// 3) Text above the configured length (when > 0) is attached as a 'Pasted' file.
+		if (
+			text.length > 0 &&
+			pasteLongTextToFileLength > 0 &&
+			text.length > pasteLongTextToFileLength
+		) {
+			event.preventDefault();
+
+			const textFile = new File([text], 'Pasted', {
+				type: MimeTypeText.PLAIN
+			});
+
+			processNewFiles([textFile]);
+		}
+	}
+
+	$effect(() => {
+		// Wait until the textarea element is bound, then run the auto-resize helper.
+		if (!textareaElement) return;
+		autoResizeTextarea(textareaElement);
+	});
+
+	$effect(() => {
+		setEditModeActive(processNewFiles);
+		// Teardown: clear edit mode again when this effect is cleaned up.
+		return () => {
+			clearEditMode();
+		};
+	});
+</script>
+
+<svelte:window onkeydown={handleGlobalKeydown} />
+
+<input
+	bind:this={fileInputElement}
+	type="file"
+	multiple
+	class="hidden"
+	onchange={handleFileInputChange}
+/>
+
+<div
+	class="{INPUT_CLASSES} w-full max-w-[80%] overflow-hidden rounded-3xl backdrop-blur-md"
+	data-slot="edit-form"
+>
+	<ChatAttachmentsList
+		attachments={editedExtras}
+		uploadedFiles={editedUploadedFiles}
+		readonly={false}
+		onFileRemove={(fileId) => {
+			if (fileId.startsWith('attachment-')) {
+				const index = parseInt(fileId.replace('attachment-', ''), 10);
+				if (!isNaN(index) && index >= 0 && index < editedExtras.length) {
+					handleRemoveExistingAttachment(index);
+				}
+			} else {
+				handleRemoveUploadedFile(fileId);
+			}
+		}}
+		limitToSingleRow
+		class="py-5"
+		style="scroll-padding: 1rem;"
+	/>
+
+	<div class="relative min-h-[48px] px-5 py-3">
+		<textarea
+			bind:this={textareaElement}
+			bind:value={editedContent}
+			class="field-sizing-content max-h-80 min-h-10 w-full resize-none bg-transparent text-sm outline-none"
+			onkeydown={onEditKeydown}
+			oninput={(e) => {
+				autoResizeTextarea(e.currentTarget);
+				onEditedContentChange(e.currentTarget.value);
+			}}
+			onpaste={handlePaste}
+			placeholder="Edit your message..."
+		></textarea>
+
+		<div class="flex w-full items-center gap-3" style="container-type: inline-size">
+			<Button
+				class="h-8 w-8 shrink-0 rounded-full bg-transparent p-0 text-muted-foreground hover:bg-foreground/10 hover:text-foreground"
+				onclick={() => fileInputElement?.click()}
+				type="button"
+				title="Add attachment"
+			>
+				<span class="sr-only">Attach files</span>
+
+				<Paperclip class="h-4 w-4" />
+			</Button>
+
+			<div class="flex-1"></div>
+
+			{#if isRouter}
+				<ModelsSelector
+					forceForegroundText={true}
+					useGlobalSelection={true}
+					onModelChange={handleModelChange}
+				/>
+			{/if}
+
+			<Button
+				class="h-8 w-8 shrink-0 rounded-full p-0"
+				onclick={handleSubmit}
+				disabled={!canSubmit}
+				type="button"
+				title={saveWithoutRegenerate ? 'Save changes' : 'Send and regenerate'}
+			>
+				<span class="sr-only">{saveWithoutRegenerate ? 'Save' : 'Send'}</span>
+
+				<ArrowUp class="h-5 w-5" />
+			</Button>
+		</div>
+	</div>
+</div>
+
+<div class="mt-2 flex w-full max-w-[80%] items-center justify-between">
+	{#if showSaveOnlyOption && onSaveEditOnly}
+		<div class="flex items-center gap-2">
+			<Switch id="save-only-switch" bind:checked={saveWithoutRegenerate} class="scale-75" />
+
+			<label for="save-only-switch" class="cursor-pointer text-xs text-muted-foreground">
+				Update without re-sending
+			</label>
+		</div>
+	{:else}
+		<div></div>
+	{/if}
+
+	<Button class="h-7 px-3 text-xs" onclick={attemptCancel} size="sm" variant="ghost">
+		<X class="mr-1 h-3 w-3" />
+
+		Cancel
+	</Button>
+</div>
+
+<DialogConfirmation
+	bind:open={showDiscardDialog}
+	title="Discard changes?"
+	description="You have unsaved changes. Are you sure you want to discard them?"
+	confirmText="Discard"
+	cancelText="Keep editing"
+	variant="destructive"
+	icon={AlertTriangle}
+	onConfirm={onCancelEdit}
+	onCancel={() => (showDiscardDialog = false)}
+/>
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte
index 3d2b8dd35..041c6bd25 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte
@@ -1,18 +1,17 @@
 <script lang="ts">
-	import { Check, X, Send } from '@lucide/svelte';
 	import { Card } from '$lib/components/ui/card';
-	import { Button } from '$lib/components/ui/button';
 	import { ChatAttachmentsList, MarkdownContent } from '$lib/components/app';
-	import { INPUT_CLASSES } from '$lib/constants/input-classes';
 	import { config } from '$lib/stores/settings.svelte';
-	import { autoResizeTextarea } from '$lib/utils';
 	import ChatMessageActions from './ChatMessageActions.svelte';
+	import ChatMessageEditForm from './ChatMessageEditForm.svelte';
 
 	interface Props {
 		class?: string;
 		message: DatabaseMessage;
 		isEditing: boolean;
 		editedContent: string;
+		editedExtras?: DatabaseMessageExtra[];
+		editedUploadedFiles?: ChatUploadedFile[];
 		siblingInfo?: ChatMessageSiblingInfo | null;
 		showDeleteDialog: boolean;
 		deletionInfo: {
@@ -26,6 +25,8 @@
 		onSaveEditOnly?: () => void;
 		onEditKeydown: (event: KeyboardEvent) => void;
 		onEditedContentChange: (content: string) => void;
+		onEditedExtrasChange?: (extras: DatabaseMessageExtra[]) => void;
+		onEditedUploadedFilesChange?: (files: ChatUploadedFile[]) => void;
 		onCopy: () => void;
 		onEdit: () => void;
 		onDelete: () => void;
@@ -40,6 +41,8 @@
 		message,
 		isEditing,
 		editedContent,
+		editedExtras = [],
+		editedUploadedFiles = [],
 		siblingInfo = null,
 		showDeleteDialog,
 		deletionInfo,
@@ -48,6 +51,8 @@
 		onSaveEditOnly,
 		onEditKeydown,
 		onEditedContentChange,
+		onEditedExtrasChange,
+		onEditedUploadedFilesChange,
 		onCopy,
 		onEdit,
 		onDelete,
@@ -61,12 +66,6 @@
 	let messageElement: HTMLElement | undefined = $state();
 	const currentConfig = config();
 
-	$effect(() => {
-		if (isEditing && textareaElement) {
-			autoResizeTextarea(textareaElement);
-		}
-	});
-
 	$effect(() => {
 		if (!messageElement || !message.content.trim()) return;
 
@@ -98,44 +97,23 @@
 	role="group"
 >
 	{#if isEditing}
-		<div class="w-full max-w-[80%]">
-			<textarea
-				bind:this={textareaElement}
-				bind:value={editedContent}
-				class="min-h-[60px] w-full resize-none rounded-2xl px-3 py-2 text-sm {INPUT_CLASSES}"
-				onkeydown={onEditKeydown}
-				oninput={(e) => {
-					autoResizeTextarea(e.currentTarget);
-					onEditedContentChange(e.currentTarget.value);
-				}}
-				placeholder="Edit your message..."
-			></textarea>
-
-			<div class="mt-2 flex justify-end gap-2">
-				<Button class="h-8 px-3" onclick={onCancelEdit} size="sm" variant="ghost">
-					<X class="mr-1 h-3 w-3" />
-					Cancel
-				</Button>
-
-				{#if onSaveEditOnly}
-					<Button
-						class="h-8 px-3"
-						onclick={onSaveEditOnly}
-						disabled={!editedContent.trim()}
-						size="sm"
-						variant="outline"
-					>
-						<Check class="mr-1 h-3 w-3" />
-						Save
-					</Button>
-				{/if}
-
-				<Button class="h-8 px-3" onclick={onSaveEdit} disabled={!editedContent.trim()} size="sm">
-					<Send class="mr-1 h-3 w-3" />
-					Send
-				</Button>
-			</div>
-		</div>
+		<ChatMessageEditForm
+			bind:textareaElement
+			messageId={message.id}
+			{editedContent}
+			{editedExtras}
+			{editedUploadedFiles}
+			originalContent={message.content}
+			originalExtras={message.extra}
+			showSaveOnlyOption={!!onSaveEditOnly}
+			{onCancelEdit}
+			{onSaveEdit}
+			{onSaveEditOnly}
+			{onEditKeydown}
+			{onEditedContentChange}
+			{onEditedExtrasChange}
+			{onEditedUploadedFilesChange}
+		/>
 	{:else}
 		{#if message.extra && message.extra.length > 0}
 			<div class="mb-2 max-w-[80%]">
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte
index 2e5f57cb6..c203f1009 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte
@@ -66,10 +66,14 @@
 		await conversationsStore.navigateToSibling(siblingId);
 	}
 
-	async function handleEditWithBranching(message: DatabaseMessage, newContent: string) {
+	async function handleEditWithBranching(
+		message: DatabaseMessage,
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[]
+	) {
 		onUserAction?.();
 
-		await chatStore.editMessageWithBranching(message.id, newContent);
+		await chatStore.editMessageWithBranching(message.id, newContent, newExtras);
 
 		refreshAllMessages();
 	}
@@ -104,11 +108,12 @@
 
 	async function handleEditUserMessagePreserveResponses(
 		message: DatabaseMessage,
-		newContent: string
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[]
 	) {
 		onUserAction?.();
 
-		await chatStore.editUserMessagePreserveResponses(message.id, newContent);
+		await chatStore.editUserMessagePreserveResponses(message.id, newContent, newExtras);
 
 		refreshAllMessages();
 	}
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
index 23a9f37b7..b777f4019 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte
@@ -17,7 +17,13 @@
 		AUTO_SCROLL_INTERVAL,
 		INITIAL_SCROLL_DELAY
 	} from '$lib/constants/auto-scroll';
-	import { chatStore, errorDialog, isLoading } from '$lib/stores/chat.svelte';
+	import {
+		chatStore,
+		errorDialog,
+		isLoading,
+		isEditing,
+		getAddFilesHandler
+	} from '$lib/stores/chat.svelte';
 	import {
 		conversationsStore,
 		activeMessages,
@@ -181,7 +187,18 @@
 		dragCounter = 0;
 
 		if (event.dataTransfer?.files) {
-			processFiles(Array.from(event.dataTransfer.files));
+			const files = Array.from(event.dataTransfer.files);
+
+			if (isEditing()) {
+				const handler = getAddFilesHandler();
+
+				if (handler) {
+					handler(files);
+					return;
+				}
+			}
+
+			processFiles(files);
 		}
 	}
 
@@ -410,7 +427,7 @@
 
 			<div class="conversation-chat-form pointer-events-auto rounded-t-3xl pb-4">
 				<ChatForm
-					disabled={hasPropsError}
+					disabled={hasPropsError || isEditing()}
 					isLoading={isCurrentConversationLoading}
 					onFileRemove={handleFileRemove}
 					onFileUpload={handleFileUpload}
diff --git a/tools/server/webui/src/lib/components/ui/switch/index.ts b/tools/server/webui/src/lib/components/ui/switch/index.ts
new file mode 100644
index 000000000..129f8f5c3
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/switch/index.ts
@@ -0,0 +1,7 @@
+import Root from './switch.svelte';
+
+export {
+	Root,
+	// Same component, also exported under the name `Switch`.
+	Root as Switch
+};
diff --git a/tools/server/webui/src/lib/components/ui/switch/switch.svelte b/tools/server/webui/src/lib/components/ui/switch/switch.svelte
new file mode 100644
index 000000000..5a5975e13
--- /dev/null
+++ b/tools/server/webui/src/lib/components/ui/switch/switch.svelte
@@ -0,0 +1,29 @@
+<script lang="ts">
+	import { Switch as SwitchPrimitive } from 'bits-ui';
+	import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
+
+	let {
+		ref = $bindable(null),
+		class: className,
+		checked = $bindable(false),
+		...restProps
+	}: WithoutChildrenOrChild<SwitchPrimitive.RootProps> = $props();
+</script>
+
+<SwitchPrimitive.Root
+	bind:ref
+	bind:checked
+	data-slot="switch"
+	class={cn(
+		'peer inline-flex h-[1.15rem] w-8 shrink-0 items-center rounded-full border border-transparent shadow-xs transition-all outline-none focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input dark:data-[state=unchecked]:bg-input/80',
+		className
+	)}
+	{...restProps}
+>
+	<SwitchPrimitive.Thumb
+		data-slot="switch-thumb"
+		class={cn(
+			'pointer-events-none block size-4 rounded-full bg-background ring-0 transition-transform data-[state=checked]:translate-x-[calc(100%-2px)] data-[state=unchecked]:translate-x-0 dark:data-[state=checked]:bg-primary-foreground dark:data-[state=unchecked]:bg-foreground'
+		)}
+	/>
+</SwitchPrimitive.Root>
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index e0431ee64..010889452 100644
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -74,6 +74,8 @@ class ChatStore {
 	private processingStates = new SvelteMap<string, ApiProcessingState | null>();
 	private activeConversationId = $state<string | null>(null);
 	private isStreamingActive = $state(false);
+	private isEditModeActive = $state(false);
+	private addFilesHandler: ((files: File[]) => void) | null = $state(null);
 
 	// ─────────────────────────────────────────────────────────────────────────────
 	// Loading State
@@ -965,230 +967,9 @@ class ChatStore {
 	// Editing
 	// ─────────────────────────────────────────────────────────────────────────────
 
-	async editAssistantMessage(
-		messageId: string,
-		newContent: string,
-		shouldBranch: boolean
-	): Promise<void> {
-		const activeConv = conversationsStore.activeConversation;
-		if (!activeConv || this.isLoading) return;
-
-		const result = this.getMessageByIdWithRole(messageId, 'assistant');
-		if (!result) return;
-		const { message: msg, index: idx } = result;
-
-		try {
-			if (shouldBranch) {
-				const newMessage = await DatabaseService.createMessageBranch(
-					{
-						convId: msg.convId,
-						type: msg.type,
-						timestamp: Date.now(),
-						role: msg.role,
-						content: newContent,
-						thinking: msg.thinking || '',
-						toolCalls: msg.toolCalls || '',
-						children: [],
-						model: msg.model
-					},
-					msg.parent!
-				);
-				await conversationsStore.updateCurrentNode(newMessage.id);
-			} else {
-				await DatabaseService.updateMessage(msg.id, { content: newContent, timestamp: Date.now() });
-				await conversationsStore.updateCurrentNode(msg.id);
-				conversationsStore.updateMessageAtIndex(idx, {
-					content: newContent,
-					timestamp: Date.now()
-				});
-			}
-			conversationsStore.updateConversationTimestamp();
-			await conversationsStore.refreshActiveMessages();
-		} catch (error) {
-			console.error('Failed to edit assistant message:', error);
-		}
-	}
-
-	async editUserMessagePreserveResponses(messageId: string, newContent: string): Promise<void> {
-		const activeConv = conversationsStore.activeConversation;
-		if (!activeConv) return;
-
-		const result = this.getMessageByIdWithRole(messageId, 'user');
-		if (!result) return;
-		const { message: msg, index: idx } = result;
-
-		try {
-			await DatabaseService.updateMessage(messageId, {
-				content: newContent,
-				timestamp: Date.now()
-			});
-			conversationsStore.updateMessageAtIndex(idx, { content: newContent, timestamp: Date.now() });
-
-			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
-			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
-
-			if (rootMessage && msg.parent === rootMessage.id && newContent.trim()) {
-				await conversationsStore.updateConversationTitleWithConfirmation(
-					activeConv.id,
-					newContent.trim(),
-					conversationsStore.titleUpdateConfirmationCallback
-				);
-			}
-			conversationsStore.updateConversationTimestamp();
-		} catch (error) {
-			console.error('Failed to edit user message:', error);
-		}
-	}
-
-	async editMessageWithBranching(messageId: string, newContent: string): Promise<void> {
-		const activeConv = conversationsStore.activeConversation;
-		if (!activeConv || this.isLoading) return;
-
-		let result = this.getMessageByIdWithRole(messageId, 'user');
-
-		if (!result) {
-			result = this.getMessageByIdWithRole(messageId, 'system');
-		}
-
-		if (!result) return;
-		const { message: msg } = result;
-
-		try {
-			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
-			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
-			const isFirstUserMessage =
-				msg.role === 'user' && rootMessage && msg.parent === rootMessage.id;
-
-			const parentId = msg.parent || rootMessage?.id;
-			if (!parentId) return;
-
-			const newMessage = await DatabaseService.createMessageBranch(
-				{
-					convId: msg.convId,
-					type: msg.type,
-					timestamp: Date.now(),
-					role: msg.role,
-					content: newContent,
-					thinking: msg.thinking || '',
-					toolCalls: msg.toolCalls || '',
-					children: [],
-					extra: msg.extra ? JSON.parse(JSON.stringify(msg.extra)) : undefined,
-					model: msg.model
-				},
-				parentId
-			);
-			await conversationsStore.updateCurrentNode(newMessage.id);
-			conversationsStore.updateConversationTimestamp();
-
-			if (isFirstUserMessage && newContent.trim()) {
-				await conversationsStore.updateConversationTitleWithConfirmation(
-					activeConv.id,
-					newContent.trim(),
-					conversationsStore.titleUpdateConfirmationCallback
-				);
-			}
-			await conversationsStore.refreshActiveMessages();
-
-			if (msg.role === 'user') {
-				await this.generateResponseForMessage(newMessage.id);
-			}
-		} catch (error) {
-			console.error('Failed to edit message with branching:', error);
-		}
-	}
-
-	async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> {
-		const activeConv = conversationsStore.activeConversation;
-		if (!activeConv || this.isLoading) return;
-		try {
-			const idx = conversationsStore.findMessageIndex(messageId);
-			if (idx === -1) return;
-			const msg = conversationsStore.activeMessages[idx];
-			if (msg.role !== 'assistant') return;
-
-			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
-			const parentMessage = allMessages.find((m) => m.id === msg.parent);
-			if (!parentMessage) return;
-
-			this.setChatLoading(activeConv.id, true);
-			this.clearChatStreaming(activeConv.id);
-
-			const newAssistantMessage = await DatabaseService.createMessageBranch(
-				{
-					convId: activeConv.id,
-					type: 'text',
-					timestamp: Date.now(),
-					role: 'assistant',
-					content: '',
-					thinking: '',
-					toolCalls: '',
-					children: [],
-					model: null
-				},
-				parentMessage.id
-			);
-			await conversationsStore.updateCurrentNode(newAssistantMessage.id);
-			conversationsStore.updateConversationTimestamp();
-			await conversationsStore.refreshActiveMessages();
-
-			const conversationPath = filterByLeafNodeId(
-				allMessages,
-				parentMessage.id,
-				false
-			) as DatabaseMessage[];
-			// Use modelOverride if provided, otherwise use the original message's model
-			// If neither is available, don't pass model (will use global selection)
-			const modelToUse = modelOverride || msg.model || undefined;
-			await this.streamChatCompletion(
-				conversationPath,
-				newAssistantMessage,
-				undefined,
-				undefined,
-				modelToUse
-			);
-		} catch (error) {
-			if (!this.isAbortError(error))
-				console.error('Failed to regenerate message with branching:', error);
-			this.setChatLoading(activeConv?.id || '', false);
-		}
-	}
-
-	private async generateResponseForMessage(userMessageId: string): Promise<void> {
-		const activeConv = conversationsStore.activeConversation;
-
-		if (!activeConv) return;
-
-		this.errorDialogState = null;
-		this.setChatLoading(activeConv.id, true);
-		this.clearChatStreaming(activeConv.id);
-
-		try {
-			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
-			const conversationPath = filterByLeafNodeId(
-				allMessages,
-				userMessageId,
-				false
-			) as DatabaseMessage[];
-			const assistantMessage = await DatabaseService.createMessageBranch(
-				{
-					convId: activeConv.id,
-					type: 'text',
-					timestamp: Date.now(),
-					role: 'assistant',
-					content: '',
-					thinking: '',
-					toolCalls: '',
-					children: [],
-					model: null
-				},
-				userMessageId
-			);
-			conversationsStore.addMessageToActive(assistantMessage);
-			await this.streamChatCompletion(conversationPath, assistantMessage);
-		} catch (error) {
-			console.error('Failed to generate response:', error);
-			this.setChatLoading(activeConv.id, false);
-		}
+	clearEditMode(): void {
+		this.isEditModeActive = false;
+		this.addFilesHandler = null;
 	}
 
 	async continueAssistantMessage(messageId: string): Promise<void> {
@@ -1340,19 +1121,284 @@ class ChatStore {
 		}
 	}
 
-	public isChatLoadingPublic(convId: string): boolean {
-		return this.isChatLoading(convId);
+	async editAssistantMessage( // Replace an assistant message's content, either in place or as a new branch.
+		messageId: string,
+		newContent: string,
+		shouldBranch: boolean // true: create a new branch under the same parent; false: overwrite the existing message
+	): Promise<void> {
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv || this.isLoading) return; // no-op without an active conversation or while isLoading is set
+
+		const result = this.getMessageByIdWithRole(messageId, 'assistant');
+		if (!result) return; // lookup returned nothing for this id/role
+		const { message: msg, index: idx } = result;
+
+		try {
+			if (shouldBranch) {
+				const newMessage = await DatabaseService.createMessageBranch( // new node reusing msg's fields, with newContent and no children
+					{
+						convId: msg.convId,
+						type: msg.type,
+						timestamp: Date.now(),
+						role: msg.role,
+						content: newContent,
+						thinking: msg.thinking || '',
+						toolCalls: msg.toolCalls || '',
+						children: [],
+						model: msg.model
+					},
+					msg.parent! // NOTE(review): non-null assertion; presumably assistant messages always have a parent — confirm
+				);
+				await conversationsStore.updateCurrentNode(newMessage.id); // point the conversation at the new branch
+			} else {
+				await DatabaseService.updateMessage(msg.id, { content: newContent }); // persist the new content first
+				await conversationsStore.updateCurrentNode(msg.id);
+				conversationsStore.updateMessageAtIndex(idx, { // then apply the same change at the message's index in the store
+					content: newContent
+				});
+			}
+			conversationsStore.updateConversationTimestamp();
+			await conversationsStore.refreshActiveMessages();
+		} catch (error) {
+			console.error('Failed to edit assistant message:', error); // failures are logged, not rethrown
+		}
 	}
+
+	async editUserMessagePreserveResponses( // Edit a user message in place: no new branch is created and no response is generated here.
+		messageId: string,
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[] // undefined leaves extras untouched; an empty array clears them
+	): Promise<void> {
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv) return; // nothing to edit without an active conversation
+
+		const result = this.getMessageByIdWithRole(messageId, 'user');
+		if (!result) return; // lookup returned nothing for this id/role
+		const { message: msg, index: idx } = result;
+
+		try {
+			const updateData: Partial<DatabaseMessage> = {
+				content: newContent
+			};
+
+			// Update extras if provided (including empty array to clear attachments)
+			// Deep clone to avoid Proxy objects from Svelte reactivity
+			if (newExtras !== undefined) {
+				updateData.extra = JSON.parse(JSON.stringify(newExtras));
+			}
+
+			await DatabaseService.updateMessage(messageId, updateData); // persist first
+			conversationsStore.updateMessageAtIndex(idx, updateData); // then apply the same fields at the message's index in the store
+
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
+
+			if (rootMessage && msg.parent === rootMessage.id && newContent.trim()) { // direct child of the root with non-blank content
+				await conversationsStore.updateConversationTitleWithConfirmation(
+					activeConv.id,
+					newContent.trim(), // trimmed content is passed as the proposed title
+					conversationsStore.titleUpdateConfirmationCallback
+				);
+			}
+			conversationsStore.updateConversationTimestamp();
+		} catch (error) {
+			console.error('Failed to edit user message:', error); // failures are logged, not rethrown
+		}
+	}
+
+	async editMessageWithBranching( // Edit a user or system message by creating a new branch instead of overwriting it.
+		messageId: string,
+		newContent: string,
+		newExtras?: DatabaseMessageExtra[] // undefined: reuse the original message's extras, if it has any
+	): Promise<void> {
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv || this.isLoading) return; // no-op without an active conversation or while isLoading is set
+
+		let result = this.getMessageByIdWithRole(messageId, 'user'); // try the id as a user message first
+
+		if (!result) {
+			result = this.getMessageByIdWithRole(messageId, 'system'); // otherwise try it as a system message
+		}
+
+		if (!result) return;
+		const { message: msg } = result;
+
+		try {
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
+			const isFirstUserMessage = // a user message whose parent is the root node
+				msg.role === 'user' && rootMessage && msg.parent === rootMessage.id;
+
+			const parentId = msg.parent || rootMessage?.id; // fall back to the root when msg has no parent
+			if (!parentId) return; // no parent to attach the new branch to
+
+			// Use newExtras if provided, otherwise copy existing extras
+			// Deep clone to avoid Proxy objects from Svelte reactivity
+			const extrasToUse =
+				newExtras !== undefined
+					? JSON.parse(JSON.stringify(newExtras))
+					: msg.extra
+						? JSON.parse(JSON.stringify(msg.extra))
+						: undefined;
+
+			const newMessage = await DatabaseService.createMessageBranch( // new node reusing msg's fields, with newContent and no children
+				{
+					convId: msg.convId,
+					type: msg.type,
+					timestamp: Date.now(),
+					role: msg.role,
+					content: newContent,
+					thinking: msg.thinking || '',
+					toolCalls: msg.toolCalls || '',
+					children: [],
+					extra: extrasToUse,
+					model: msg.model
+				},
+				parentId
+			);
+			await conversationsStore.updateCurrentNode(newMessage.id); // point the conversation at the new branch
+			conversationsStore.updateConversationTimestamp();
+
+			if (isFirstUserMessage && newContent.trim()) { // only a non-blank first user message proposes a new title
+				await conversationsStore.updateConversationTitleWithConfirmation(
+					activeConv.id,
+					newContent.trim(),
+					conversationsStore.titleUpdateConfirmationCallback
+				);
+			}
+			await conversationsStore.refreshActiveMessages();
+
+			if (msg.role === 'user') { // only user edits trigger a new assistant response; system edits stop here
+				await this.generateResponseForMessage(newMessage.id);
+			}
+		} catch (error) {
+			console.error('Failed to edit message with branching:', error); // failures are logged, not rethrown
+		}
+	}
+
+	async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> { // Regenerate an assistant reply as a new branch under the same parent.
+		const activeConv = conversationsStore.activeConversation;
+		if (!activeConv || this.isLoading) return; // no-op without an active conversation or while isLoading is set
+		try {
+			const idx = conversationsStore.findMessageIndex(messageId);
+			if (idx === -1) return; // id is not in the active message list
+			const msg = conversationsStore.activeMessages[idx];
+			if (msg.role !== 'assistant') return; // only assistant messages can be regenerated
+
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const parentMessage = allMessages.find((m) => m.id === msg.parent);
+			if (!parentMessage) return; // all early returns above happen before the loading flag is set
+
+			this.setChatLoading(activeConv.id, true);
+			this.clearChatStreaming(activeConv.id);
+
+			const newAssistantMessage = await DatabaseService.createMessageBranch( // empty assistant placeholder to stream into
+				{
+					convId: activeConv.id,
+					type: 'text',
+					timestamp: Date.now(),
+					role: 'assistant',
+					content: '',
+					thinking: '',
+					toolCalls: '',
+					children: [],
+					model: null
+				},
+				parentMessage.id
+			);
+			await conversationsStore.updateCurrentNode(newAssistantMessage.id); // point the conversation at the new branch
+			conversationsStore.updateConversationTimestamp();
+			await conversationsStore.refreshActiveMessages();
+
+			const conversationPath = filterByLeafNodeId( // built from allMessages, using the parent as the leaf node
+				allMessages,
+				parentMessage.id,
+				false
+			) as DatabaseMessage[];
+			// Use modelOverride if provided, otherwise use the original message's model
+			// If neither is available, don't pass model (will use global selection)
+			const modelToUse = modelOverride || msg.model || undefined;
+			await this.streamChatCompletion(
+				conversationPath,
+				newAssistantMessage,
+				undefined,
+				undefined,
+				modelToUse
+			);
+		} catch (error) {
+			if (!this.isAbortError(error)) // abort errors are not logged
+				console.error('Failed to regenerate message with branching:', error);
+			this.setChatLoading(activeConv.id, false); // activeConv is non-null here (guarded above), so no fallback id is needed
+		}
+	}
+
+	private async generateResponseForMessage(userMessageId: string): Promise<void> { // Create an empty assistant message under userMessageId and stream a completion into it.
+		const activeConv = conversationsStore.activeConversation;
+
+		if (!activeConv) return; // nothing to do without an active conversation
+
+		this.errorDialogState = null; // reset the stored error dialog state before starting
+		this.setChatLoading(activeConv.id, true);
+		this.clearChatStreaming(activeConv.id);
+
+		try {
+			const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
+			const conversationPath = filterByLeafNodeId( // built from allMessages, using the user message as the leaf node
+				allMessages,
+				userMessageId,
+				false
+			) as DatabaseMessage[];
+			const assistantMessage = await DatabaseService.createMessageBranch( // empty assistant placeholder to stream into
+				{
+					convId: activeConv.id,
+					type: 'text',
+					timestamp: Date.now(),
+					role: 'assistant',
+					content: '',
+					thinking: '',
+					toolCalls: '',
+					children: [],
+					model: null
+				},
+				userMessageId
+			);
+			conversationsStore.addMessageToActive(assistantMessage);
+			await this.streamChatCompletion(conversationPath, assistantMessage); // no model argument is passed here
+		} catch (error) {
+			console.error('Failed to generate response:', error); // NOTE(review): unlike regenerateMessageWithBranching, abort errors are not filtered here — confirm intended
+			this.setChatLoading(activeConv.id, false);
+		}
+	}
+
+	getAddFilesHandler(): ((files: File[]) => void) | null { // Return the stored add-files handler, or null when none is set.
+		return this.addFilesHandler;
+	}
+
+	public getAllLoadingChats(): string[] { // Snapshot of the keys of chatLoadingStates (presumably conversation ids — confirm).
+		return Array.from(this.chatLoadingStates.keys());
+	}
+
+	public getAllStreamingChats(): string[] { // Snapshot of the keys of chatStreamingStates (presumably conversation ids — confirm).
+		return Array.from(this.chatStreamingStates.keys());
+	}
+
 	public getChatStreamingPublic(
 		convId: string
 	): { response: string; messageId: string } | undefined {
 		return this.getChatStreaming(convId);
 	}
-	public getAllLoadingChats(): string[] {
-		return Array.from(this.chatLoadingStates.keys());
+
+	public isChatLoadingPublic(convId: string): boolean { // Public entry point that delegates to isChatLoading(convId).
+		return this.isChatLoading(convId);
 	}
-	public getAllStreamingChats(): string[] {
-		return Array.from(this.chatStreamingStates.keys());
+
+	isEditing(): boolean { // True between setEditModeActive() and clearEditMode().
+		return this.isEditModeActive;
+	}
+
+	setEditModeActive(handler: (files: File[]) => void): void { // Enter edit mode and store the handler returned by getAddFilesHandler().
+		this.isEditModeActive = true;
+		this.addFilesHandler = handler;
 	}
 
 	// ─────────────────────────────────────────────────────────────────────────────
@@ -1416,13 +1462,17 @@ class ChatStore {
 
 export const chatStore = new ChatStore();
 
-export const isLoading = () => chatStore.isLoading;
+export const activeProcessingState = () => chatStore.activeProcessingState; // function accessor: reads the store property on every call
+export const clearEditMode = () => chatStore.clearEditMode(); // forwards to ChatStore.clearEditMode()
 export const currentResponse = () => chatStore.currentResponse;
 export const errorDialog = () => chatStore.errorDialogState;
-export const activeProcessingState = () => chatStore.activeProcessingState;
-export const isChatStreaming = () => chatStore.isStreaming();
-
-export const isChatLoading = (convId: string) => chatStore.isChatLoadingPublic(convId);
-export const getChatStreaming = (convId: string) => chatStore.getChatStreamingPublic(convId);
+export const getAddFilesHandler = () => chatStore.getAddFilesHandler(); // the stored add-files handler, or null
 export const getAllLoadingChats = () => chatStore.getAllLoadingChats();
 export const getAllStreamingChats = () => chatStore.getAllStreamingChats();
+export const getChatStreaming = (convId: string) => chatStore.getChatStreamingPublic(convId); // { response, messageId } or undefined
+export const isChatLoading = (convId: string) => chatStore.isChatLoadingPublic(convId); // per-conversation loading check
+export const isChatStreaming = () => chatStore.isStreaming();
+export const isEditing = () => chatStore.isEditing(); // whether edit mode is currently active
+export const isLoading = () => chatStore.isLoading; // function accessor: reads the store property on every call
+export const setEditModeActive = (handler: (files: File[]) => void) => // forwards to ChatStore.setEditModeActive()
+	chatStore.setEditModeActive(handler);