mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge commit 'a61c8bc3bf' into concedo_experimental
# Conflicts: # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/pr2wt.sh # src/llama-model.cpp # tools/CMakeLists.txt # tools/mtmd/CMakeLists.txt # tools/mtmd/clip.cpp # tools/mtmd/clip.h
This commit is contained in:
commit
0dc18c668c
35 changed files with 3867 additions and 2025 deletions
167
common/arg.cpp
167
common/arg.cpp
|
|
@ -8,6 +8,7 @@
|
|||
#include "chat.h"
|
||||
#include "build-info.h"
|
||||
#include "download.h"
|
||||
#include "preset.h"
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
|
|
@ -270,6 +271,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
|
|||
}
|
||||
}
|
||||
|
||||
static std::string clean_file_name(const std::string & fname) {
|
||||
std::string clean_fname = fname;
|
||||
string_replace_all(clean_fname, "\\", "_");
|
||||
string_replace_all(clean_fname, "/", "_");
|
||||
return clean_fname;
|
||||
}
|
||||
|
||||
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
|
||||
GGML_ASSERT(!params.model.hf_repo.empty());
|
||||
|
||||
const bool offline = params.offline;
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
|
||||
|
||||
// prepare local path for caching
|
||||
auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
|
||||
auto preset_path = fs_get_cache_file(preset_fname);
|
||||
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
|
||||
const bool has_preset = status >= 200 && status < 400;
|
||||
|
||||
// remote preset is optional, so we don't error out if not found
|
||||
if (has_preset) {
|
||||
LOG_INF("applying remote preset from %s\n", preset_url.c_str());
|
||||
common_preset_context ctx(ex, /* only_remote_allowed */ true);
|
||||
common_preset global; // unused for now
|
||||
auto remote_presets = ctx.load_from_ini(preset_path, global);
|
||||
if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
|
||||
common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
|
||||
LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
|
||||
preset.apply_to_params(params);
|
||||
} else {
|
||||
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s", "no remote preset found, skipping\n");
|
||||
}
|
||||
|
||||
return has_preset;
|
||||
}
|
||||
|
||||
struct handle_model_result {
|
||||
bool found_mmproj = false;
|
||||
common_params_model mmproj;
|
||||
|
|
@ -311,9 +352,7 @@ static handle_model_result common_params_handle_model(
|
|||
// make sure model path is present (for caching purposes)
|
||||
if (model.path.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = model.hf_repo + "_" + model.hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
|
||||
model.path = fs_get_cache_file(filename);
|
||||
}
|
||||
|
||||
|
|
@ -427,61 +466,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||
}
|
||||
};
|
||||
|
||||
std::set<std::string> seen_args;
|
||||
auto parse_cli_args = [&]() {
|
||||
std::set<std::string> seen_args;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const std::string arg_prefix = "--";
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
std::string arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
auto & tmp = arg_to_options[arg];
|
||||
auto opt = *tmp.first;
|
||||
bool is_positive = tmp.second;
|
||||
if (opt.has_value_from_env()) {
|
||||
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
||||
}
|
||||
try {
|
||||
if (opt.handler_void) {
|
||||
opt.handler_void(params);
|
||||
continue;
|
||||
std::string arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
if (opt.handler_bool) {
|
||||
opt.handler_bool(params, is_positive);
|
||||
continue;
|
||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
auto & tmp = arg_to_options[arg];
|
||||
auto opt = *tmp.first;
|
||||
bool is_positive = tmp.second;
|
||||
if (opt.has_value_from_env()) {
|
||||
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
||||
}
|
||||
try {
|
||||
if (opt.handler_void) {
|
||||
opt.handler_void(params);
|
||||
continue;
|
||||
}
|
||||
if (opt.handler_bool) {
|
||||
opt.handler_bool(params, is_positive);
|
||||
continue;
|
||||
}
|
||||
|
||||
// arg with single value
|
||||
check_arg(i);
|
||||
std::string val = argv[++i];
|
||||
if (opt.handler_int) {
|
||||
opt.handler_int(params, std::stoi(val));
|
||||
continue;
|
||||
}
|
||||
if (opt.handler_string) {
|
||||
opt.handler_string(params, val);
|
||||
continue;
|
||||
}
|
||||
// arg with single value
|
||||
check_arg(i);
|
||||
std::string val = argv[++i];
|
||||
if (opt.handler_int) {
|
||||
opt.handler_int(params, std::stoi(val));
|
||||
continue;
|
||||
}
|
||||
if (opt.handler_string) {
|
||||
opt.handler_string(params, val);
|
||||
continue;
|
||||
}
|
||||
|
||||
// arg with 2 values
|
||||
check_arg(i);
|
||||
std::string val2 = argv[++i];
|
||||
if (opt.handler_str_str) {
|
||||
opt.handler_str_str(params, val, val2);
|
||||
continue;
|
||||
// arg with 2 values
|
||||
check_arg(i);
|
||||
std::string val2 = argv[++i];
|
||||
if (opt.handler_str_str) {
|
||||
opt.handler_str_str(params, val, val2);
|
||||
continue;
|
||||
}
|
||||
} catch (std::exception & e) {
|
||||
throw std::invalid_argument(string_format(
|
||||
"error while handling argument \"%s\": %s\n\n"
|
||||
"usage:\n%s\n\nto show complete usage, run with -h",
|
||||
arg.c_str(), e.what(), opt.to_string().c_str()));
|
||||
}
|
||||
} catch (std::exception & e) {
|
||||
throw std::invalid_argument(string_format(
|
||||
"error while handling argument \"%s\": %s\n\n"
|
||||
"usage:\n%s\n\nto show complete usage, run with -h",
|
||||
arg.c_str(), e.what(), opt.to_string().c_str()));
|
||||
}
|
||||
};
|
||||
|
||||
// parse the first time to get -hf option (used for remote preset)
|
||||
parse_cli_args();
|
||||
|
||||
// maybe handle remote preset
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
std::string cli_hf_repo = params.model.hf_repo;
|
||||
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
|
||||
|
||||
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
|
||||
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
|
||||
std::string preset_hf_repo = params.model.hf_repo;
|
||||
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
|
||||
|
||||
if (has_preset) {
|
||||
// re-parse CLI args to override preset values
|
||||
parse_cli_args();
|
||||
}
|
||||
|
||||
// preserve hf_repo from preset if needed
|
||||
if (preset_has_hf_repo) {
|
||||
params.model.hf_repo = preset_hf_repo;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -157,6 +157,10 @@ static std::string read_etag(const std::string & path) {
|
|||
return none;
|
||||
}
|
||||
|
||||
static bool is_http_status_ok(int status) {
|
||||
return status >= 200 && status < 400;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
|
|
@ -306,12 +310,14 @@ static bool common_download_head(CURL * curl,
|
|||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
// returns status code or -1 on error
|
||||
static int common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
std::string etag;
|
||||
|
||||
|
|
@ -371,7 +377,7 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -380,14 +386,14 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
if (std::filesystem::exists(path_temporary)) {
|
||||
if (remove(path_temporary.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (std::filesystem::exists(path)) {
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -414,23 +420,27 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
|
||||
int status = static_cast<int>(http_code);
|
||||
if (!is_http_status_ok(http_code)) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
return status; // TODO: maybe only return on certain codes
|
||||
}
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return static_cast<int>(http_code);
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
break;
|
||||
return 304; // Not Modified - fake cached response
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return -1; // max attempts reached
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
||||
|
|
@ -625,7 +635,8 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
// returns status code or -1 on error
|
||||
static int common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
|
|
@ -659,8 +670,10 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
|
||||
if (file_exists) {
|
||||
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
|
||||
return true;
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
}
|
||||
return head->status; // cannot use cached file, return raw status code
|
||||
// TODO: maybe retry only on certain codes
|
||||
}
|
||||
|
||||
std::string etag;
|
||||
|
|
@ -692,12 +705,12 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
if (file_exists) {
|
||||
if (!should_download_from_scratch) {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
return true;
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
}
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -709,7 +722,7 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
existing_size = std::filesystem::file_size(path_temporary);
|
||||
} else if (remove(path_temporary.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -730,15 +743,16 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
|
||||
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
if (!etag.empty()) {
|
||||
write_etag(path, etag);
|
||||
}
|
||||
break;
|
||||
|
||||
return head->status; // TODO: use actual GET status?
|
||||
}
|
||||
|
||||
return true;
|
||||
return -1; // max attempts reached
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
|
||||
|
|
@ -777,22 +791,22 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
|||
|
||||
#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
static bool common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
int common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
if (!offline) {
|
||||
return common_download_file_single_online(url, path, bearer_token, headers);
|
||||
}
|
||||
|
||||
if (!std::filesystem::exists(path)) {
|
||||
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
|
||||
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
|
||||
return true;
|
||||
return 304; // Not Modified - fake cached response
|
||||
}
|
||||
|
||||
// download multiple files from remote URLs to local paths
|
||||
|
|
@ -810,7 +824,8 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
|
|||
std::async(
|
||||
std::launch::async,
|
||||
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline, headers);
|
||||
const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
|
||||
return is_http_status_ok(http_status);
|
||||
},
|
||||
item
|
||||
)
|
||||
|
|
@ -837,7 +852,8 @@ bool common_download_model(const common_params_model & model,
|
|||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline, headers)) {
|
||||
const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
|
||||
if (!is_http_status_ok(http_status)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -975,7 +991,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
|
|||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
|
|
@ -1094,7 +1110,8 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
|||
std::string local_path = fs_get_cache_file(model_filename);
|
||||
|
||||
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
|
||||
if (!common_download_file_single(blob_url, local_path, token, false, {})) {
|
||||
const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
|
||||
if (!is_http_status_ok(http_status)) {
|
||||
throw std::runtime_error("Failed to download Docker Model");
|
||||
}
|
||||
|
||||
|
|
@ -1120,6 +1137,14 @@ std::string common_docker_resolve_model(const std::string &) {
|
|||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
int common_download_file_single(const std::string &,
|
||||
const std::string &,
|
||||
const std::string &,
|
||||
bool,
|
||||
const common_header_list &) {
|
||||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
|
||||
|
||||
std::vector<common_cached_model_info> common_list_cached_models() {
|
||||
|
|
|
|||
|
|
@ -65,6 +65,14 @@ bool common_download_model(
|
|||
// returns list of cached models
|
||||
std::vector<common_cached_model_info> common_list_cached_models();
|
||||
|
||||
// download single file from url to local path
|
||||
// returns status code or -1 on error
|
||||
int common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers = {});
|
||||
|
||||
// resolve and download model from Docker registry
|
||||
// return local path to downloaded model file
|
||||
std::string common_docker_resolve_model(const std::string & docker);
|
||||
|
|
|
|||
|
|
@ -16,6 +16,46 @@ static std::string rm_leading_dashes(const std::string & str) {
|
|||
return str.substr(pos);
|
||||
}
|
||||
|
||||
// only allow a subset of args for remote presets for security reasons
|
||||
// do not add more args unless absolutely necessary
|
||||
// args that output to files are strictly prohibited
|
||||
static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
|
||||
static const std::set<std::string> allowed_options = {
|
||||
"model-url",
|
||||
"hf-repo",
|
||||
"hf-repo-draft",
|
||||
"hf-repo-v", // vocoder
|
||||
"hf-file-v", // vocoder
|
||||
"mmproj-url",
|
||||
"pooling",
|
||||
"jinja",
|
||||
"batch-size",
|
||||
"ubatch-size",
|
||||
"cache-reuse",
|
||||
// note: sampling params are automatically allowed by default
|
||||
// negated args will be added automatically
|
||||
};
|
||||
|
||||
std::set<std::string> allowed_keys;
|
||||
|
||||
for (const auto & it : key_to_opt) {
|
||||
const std::string & key = it.first;
|
||||
const common_arg & opt = it.second;
|
||||
if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
|
||||
allowed_keys.insert(key);
|
||||
// also add variant keys (args without leading dashes and env vars)
|
||||
for (const auto & arg : opt.get_args()) {
|
||||
allowed_keys.insert(rm_leading_dashes(arg));
|
||||
}
|
||||
for (const auto & env : opt.get_env()) {
|
||||
allowed_keys.insert(env);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return allowed_keys;
|
||||
}
|
||||
|
||||
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
|
||||
std::vector<std::string> args;
|
||||
|
||||
|
|
@ -121,6 +161,29 @@ void common_preset::merge(const common_preset & other) {
|
|||
}
|
||||
}
|
||||
|
||||
void common_preset::apply_to_params(common_params & params) const {
|
||||
for (const auto & [opt, val] : options) {
|
||||
// apply each option to params
|
||||
if (opt.handler_string) {
|
||||
opt.handler_string(params, val);
|
||||
} else if (opt.handler_int) {
|
||||
opt.handler_int(params, std::stoi(val));
|
||||
} else if (opt.handler_bool) {
|
||||
opt.handler_bool(params, common_arg_utils::is_truthy(val));
|
||||
} else if (opt.handler_str_str) {
|
||||
// not supported yet
|
||||
throw std::runtime_error(string_format(
|
||||
"%s: option with two values is not supported yet",
|
||||
__func__
|
||||
));
|
||||
} else if (opt.handler_void) {
|
||||
opt.handler_void(params);
|
||||
} else {
|
||||
GGML_ABORT("unknown handler type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
|
||||
std::map<std::string, std::map<std::string, std::string>> parsed;
|
||||
|
||||
|
|
@ -230,10 +293,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
|
|||
return value;
|
||||
}
|
||||
|
||||
common_preset_context::common_preset_context(llama_example ex)
|
||||
common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
|
||||
: ctx_params(common_params_parser_init(default_params, ex)) {
|
||||
common_params_add_preset_options(ctx_params.options);
|
||||
key_to_opt = get_map_key_opt(ctx_params);
|
||||
|
||||
// setup allowed keys if only_remote_allowed is true
|
||||
if (only_remote_allowed) {
|
||||
filter_allowed_keys = true;
|
||||
allowed_keys = get_remote_preset_whitelist(key_to_opt);
|
||||
}
|
||||
}
|
||||
|
||||
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
||||
|
|
@ -250,6 +319,12 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
|
|||
LOG_DBG("loading preset: %s\n", preset.name.c_str());
|
||||
for (const auto & [key, value] : section.second) {
|
||||
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
||||
if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
|
||||
throw std::runtime_error(string_format(
|
||||
"option '%s' is not allowed in remote presets",
|
||||
key.c_str()
|
||||
));
|
||||
}
|
||||
if (key_to_opt.find(key) != key_to_opt.end()) {
|
||||
const auto & opt = key_to_opt.at(key);
|
||||
if (is_bool_arg(opt)) {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
//
|
||||
// INI preset parser and writer
|
||||
|
|
@ -40,6 +41,9 @@ struct common_preset {
|
|||
|
||||
// merge another preset into this one, overwriting existing options
|
||||
void merge(const common_preset & other);
|
||||
|
||||
// apply preset options to common_params
|
||||
void apply_to_params(common_params & params) const;
|
||||
};
|
||||
|
||||
// interface for multiple presets in one file
|
||||
|
|
@ -50,7 +54,12 @@ struct common_preset_context {
|
|||
common_params default_params; // unused for now
|
||||
common_params_context ctx_params;
|
||||
std::map<std::string, common_arg> key_to_opt;
|
||||
common_preset_context(llama_example ex);
|
||||
|
||||
bool filter_allowed_keys = false;
|
||||
std::set<std::string> allowed_keys;
|
||||
|
||||
// if only_remote_allowed is true, only accept whitelisted keys
|
||||
common_preset_context(llama_example ex, bool only_remote_allowed = false);
|
||||
|
||||
// load presets from INI file
|
||||
common_presets load_from_ini(const std::string & path, common_preset & global) const;
|
||||
|
|
|
|||
|
|
@ -528,7 +528,11 @@ class ModelBase:
|
|||
return ()
|
||||
|
||||
def prepare_tensors(self):
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
||||
if self.tensor_map.mapping:
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
else:
|
||||
max_name_len = len("vision_encoder.weight,") # Default reasonable length
|
||||
|
||||
for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
|
||||
# we don't need these
|
||||
|
|
@ -6038,7 +6042,175 @@ class Gemma3VisionModel(MmprojModel):
|
|||
return [] # skip other tensors
|
||||
|
||||
|
||||
class ConformerAudioModel(MmprojModel):
|
||||
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@staticmethod
|
||||
def is_audio_tensor(name: str):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ConformerAudioModel.is_audio_tensor(name):
|
||||
if ".conv" in name or "_conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# fold running_mean, running_var and eps into weight and bias for batch_norm
|
||||
if "batch_norm" in name:
|
||||
if self._batch_norm_tensors is None:
|
||||
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
|
||||
assert bid is not None
|
||||
self._batch_norm_tensors[bid][name] = data_torch
|
||||
|
||||
if len(self._batch_norm_tensors[bid]) < 5:
|
||||
return []
|
||||
|
||||
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
|
||||
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
|
||||
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
|
||||
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
|
||||
eps = 1e-5 # default value
|
||||
|
||||
a = weight / torch.sqrt(running_var + eps)
|
||||
b = bias - running_mean * a
|
||||
return [
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
|
||||
]
|
||||
|
||||
# reshape conv weights
|
||||
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
|
||||
data_torch = data_torch[:, None, None]
|
||||
if "conv.depthwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[1] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
|
||||
if "conv.pointwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[2] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3nForConditionalGeneration")
|
||||
class Gemma3nVisionAudioModel(ConformerAudioModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
# Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
|
||||
# This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
|
||||
block_tensor_mapping = {
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# Parent init will call find_hparam which now returns 0 for empty keys
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
|
||||
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
|
||||
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
|
||||
|
||||
# MobileNetV5 does not use image_mean/std
|
||||
self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
|
||||
self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
|
||||
self.hparams_vision["image_size"] = self.preprocessor_config.get(
|
||||
"size", {"height": 768, "width": 768}
|
||||
)["height"]
|
||||
|
||||
# Image sequence length (256 tokens = 16x16 for Gemma3n)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
image_size = self.hparams_vision["image_size"]
|
||||
self.hparams_vision["patch_size"] = image_size // image_seq_length
|
||||
|
||||
# remap audio hparams
|
||||
assert self.hparams_audio is not None
|
||||
self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
|
||||
self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
|
||||
self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# vision params
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||
|
||||
# audio params
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# Force quantization settings for specific tensor types
|
||||
if "input_projection" in name or "input_proj" in name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".embeddings." in name or "stem" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def custom_map(self, name: str) -> str:
|
||||
"""Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
|
||||
parts = name.split(".")
|
||||
# MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
|
||||
if len(parts) >= 7:
|
||||
bid, sid = parts[4], parts[5]
|
||||
suffix = ".".join(parts[6:])
|
||||
template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
|
||||
if template in self.block_tensor_mapping:
|
||||
return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
|
||||
|
||||
raise ValueError(f"Unknown name: {name}")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if (ConformerAudioModel.is_audio_tensor(name)):
|
||||
name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
# Gemma3n uses
|
||||
# - model.embed_vision.* for projection layers
|
||||
# - model.vision_tower.* for vision encoder
|
||||
# Skip non-vision tensors
|
||||
if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
|
||||
return []
|
||||
|
||||
if name.startswith("model.vision_tower.timm_model.blocks."):
|
||||
# Double-indexed block tensors through custom logic
|
||||
new_name = self.custom_map(name)
|
||||
else:
|
||||
# Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
|
||||
data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
||||
class Gemma3NModel(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
||||
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
||||
|
|
@ -6061,8 +6233,25 @@ class Gemma3NModel(Gemma3Model):
|
|||
]
|
||||
|
||||
def set_vocab(self):
|
||||
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
|
||||
# which includes special tokens from 262144-262399 for vision/audio.
|
||||
# The vocab_size_per_layer_input (262144) is only the embedding size per layer.
|
||||
# Temporarily override the hparams lookup order to prioritize vocab_size.
|
||||
|
||||
# Store original vocab_size_per_layer_input if it exists
|
||||
vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
|
||||
|
||||
# Temporarily remove vocab_size_per_layer_input to force using vocab_size
|
||||
if vocab_size_per_layer_input is not None:
|
||||
del self.hparams["vocab_size_per_layer_input"]
|
||||
|
||||
# Call parent set_vocab which will now use vocab_size (262400)
|
||||
super().set_vocab()
|
||||
|
||||
# Restore vocab_size_per_layer_input for later use
|
||||
if vocab_size_per_layer_input is not None:
|
||||
self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
|
||||
|
|
@ -6098,8 +6287,32 @@ class Gemma3NModel(Gemma3Model):
|
|||
if "language_model." not in name:
|
||||
return [] # skip non-language model tensors
|
||||
|
||||
# Pad token embeddings for vision/audio special tokens (262144-262399)
|
||||
if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
|
||||
# Move to CPU to avoid meta device issues during padding
|
||||
data_torch = data_torch.to(device="cpu")
|
||||
|
||||
vocab_size = self.hparams.get("vocab_size", 262400)
|
||||
current_size = data_torch.shape[0] # First dimension is vocab_size
|
||||
|
||||
if current_size < vocab_size:
|
||||
# Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
|
||||
padding_size = vocab_size - current_size
|
||||
tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
|
||||
logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
|
||||
|
||||
# Create padding with zeros (vision tokens won't use these embeddings)
|
||||
padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
|
||||
data_torch = torch.cat([data_torch, padding], dim=0)
|
||||
|
||||
# Continue with normal processing
|
||||
name = name.replace("language_model.", "")
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
if "altup_unembed_projections" in name:
|
||||
data_torch = data_torch.to(device="cpu")
|
||||
# altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
|
||||
# They should NOT be padded
|
||||
if ".0." in name:
|
||||
self._altup_unembd[0] = data_torch
|
||||
elif ".1." in name:
|
||||
|
|
@ -9936,7 +10149,7 @@ class LFM2Model(TextModel):
|
|||
self._add_feed_forward_length()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self._is_vision_tensor(name) or self._is_audio_tensor(name):
|
||||
if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
|
||||
# skip multimodal tensors
|
||||
return []
|
||||
|
||||
|
|
@ -9952,9 +10165,6 @@ class LFM2Model(TextModel):
|
|||
def _is_vision_tensor(self, name: str) -> bool:
|
||||
return "vision_tower" in name or "multi_modal_projector" in name
|
||||
|
||||
def _is_audio_tensor(self, name: str):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2Model")
|
||||
class LFM2ColBertModel(LFM2Model):
|
||||
|
|
@ -10082,13 +10292,11 @@ class LFM2VLModel(MmprojModel):
|
|||
|
||||
|
||||
@ModelBase.register("Lfm2AudioForConditionalGeneration")
|
||||
class LFM2AudioModel(MmprojModel):
|
||||
class LFM2AudioModel(ConformerAudioModel):
|
||||
has_vision_encoder = False
|
||||
has_audio_encoder = True
|
||||
model_name = "Lfm2AudioEncoder"
|
||||
|
||||
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("encoder")
|
||||
|
||||
|
|
@ -10102,12 +10310,7 @@ class LFM2AudioModel(MmprojModel):
|
|||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ".conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
# skip language model tensors
|
||||
if name.startswith("lfm."):
|
||||
return []
|
||||
|
|
@ -10120,40 +10323,7 @@ class LFM2AudioModel(MmprojModel):
|
|||
if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
|
||||
return []
|
||||
|
||||
# fold running_mean, running_var and eps into weight and bias for batch_norm
|
||||
if "batch_norm" in name:
|
||||
if self._batch_norm_tensors is None:
|
||||
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
|
||||
assert bid is not None
|
||||
self._batch_norm_tensors[bid][name] = data_torch
|
||||
|
||||
if len(self._batch_norm_tensors[bid]) < 5:
|
||||
return []
|
||||
|
||||
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
|
||||
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
|
||||
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
|
||||
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
|
||||
eps = 1e-5 # default value
|
||||
|
||||
a = weight / torch.sqrt(running_var + eps)
|
||||
b = bias - running_mean * a
|
||||
return [
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
|
||||
]
|
||||
|
||||
# reshape conv weights
|
||||
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
|
||||
data_torch = data_torch[:, None, None]
|
||||
if "conv.depthwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[1] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
|
||||
if "conv.pointwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[2] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("SmallThinkerForCausalLM")
|
||||
|
|
|
|||
3989
embd_res/klite.embd
3989
embd_res/klite.embd
File diff suppressed because one or more lines are too long
|
|
@ -234,6 +234,11 @@
|
|||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define GGML_MEM_ALIGN 4
|
||||
#elif defined(__EMSCRIPTEN__)
|
||||
// emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm.
|
||||
// (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18628
|
||||
#define GGML_MEM_ALIGN 8
|
||||
#else
|
||||
#define GGML_MEM_ALIGN 16
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ extern "C" {
|
|||
// device description: short informative description of the device, could be the model name
|
||||
const char * (*get_description)(ggml_backend_dev_t dev);
|
||||
|
||||
// device memory in bytes
|
||||
// device memory in bytes: 0 bytes to indicate no memory to report
|
||||
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
||||
|
||||
// device type
|
||||
|
|
|
|||
|
|
@ -276,12 +276,13 @@ class Keys:
|
|||
DATASETS = "imatrix.datasets"
|
||||
|
||||
class Clip:
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
|
||||
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
|
||||
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
||||
|
||||
class ClipVision:
|
||||
PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models
|
||||
IMAGE_SIZE = "clip.vision.image_size"
|
||||
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
|
||||
PATCH_SIZE = "clip.vision.patch_size"
|
||||
|
|
@ -307,6 +308,7 @@ class Keys:
|
|||
SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
||||
|
||||
class ClipAudio:
|
||||
PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models
|
||||
NUM_MEL_BINS = "clip.audio.num_mel_bins"
|
||||
EMBEDDING_LENGTH = "clip.audio.embedding_length"
|
||||
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
|
||||
|
|
@ -465,6 +467,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
|||
RESAMPLER = auto()
|
||||
GLM_EDGE = auto()
|
||||
MERGER = auto()
|
||||
GEMMA3N = auto()
|
||||
GEMMA3 = auto()
|
||||
QWEN3VL = auto()
|
||||
COGVLM = auto()
|
||||
|
|
@ -675,6 +678,15 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_MM_INP_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
V_MM_SOFT_EMB_NORM = auto() # gemma3
|
||||
V_MM_EMBEDDING = auto() # gemma3n
|
||||
V_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||
V_ENC_CONV_STEM = auto() # gemma3n
|
||||
V_ENC_CONV_STEM_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_EXP = auto() # gemma3n
|
||||
V_ENC_MSFA_EXP_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_PROJ = auto() # gemma3n
|
||||
V_ENC_MSFA_PROJ_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_NORM = auto() # gemma3n
|
||||
V_RESMPL_POS_EMBD_K = auto() # minicpmv
|
||||
V_RESMPL_ATTN_Q = auto() # minicpmv
|
||||
V_RESMPL_ATTN_K = auto() # minicpmv
|
||||
|
|
@ -698,30 +710,41 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
A_ENC_EMBD_TO_LOGITS = auto()
|
||||
A_ENC_CONV1D = auto()
|
||||
A_PRE_NORM = auto()
|
||||
A_POST_NORM = auto()
|
||||
A_ENC_ATTN_Q = auto()
|
||||
A_ENC_ATTN_K = auto()
|
||||
A_ENC_ATTN_V = auto()
|
||||
A_ENC_INPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto()
|
||||
A_ENC_OUTPUT_NORM = auto()
|
||||
A_ENC_FFN_UP = auto()
|
||||
A_ENC_FFN_NORM = auto()
|
||||
A_ENC_FFN_GATE = auto()
|
||||
A_ENC_FFN_DOWN = auto()
|
||||
A_ENC_FFN_UP_1 = auto()
|
||||
A_ENC_FFN_NORM_1 = auto()
|
||||
A_ENC_FFN_GATE_1 = auto()
|
||||
A_ENC_FFN_DOWN_1 = auto()
|
||||
A_MMPROJ = auto()
|
||||
A_MMPROJ_FC = auto()
|
||||
A_MM_NORM_PRE = auto()
|
||||
A_MM_NORM_MID = auto()
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
|
||||
A_ENC_CONV1D = auto()
|
||||
A_ENC_CONV1D_NORM = auto() # gemma3n
|
||||
A_PRE_NORM = auto()
|
||||
A_POST_NORM = auto()
|
||||
A_ENC_LAYER_PRE_NORM = auto() # gemma3n
|
||||
A_ENC_ATTN_Q = auto()
|
||||
A_ENC_ATTN_K = auto()
|
||||
A_ENC_ATTN_V = auto()
|
||||
A_ENC_PER_DIM_SCALE = auto() # gemma3n
|
||||
A_ENC_INPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto()
|
||||
A_ENC_OUTPUT_NORM = auto()
|
||||
A_ENC_FFN_UP = auto()
|
||||
A_ENC_FFN_NORM = auto()
|
||||
A_ENC_FFN_POST_NORM = auto() # gemma3n
|
||||
A_ENC_FFN_SCALE = auto() # gemma3n
|
||||
A_ENC_FFN_GATE = auto()
|
||||
A_ENC_FFN_DOWN = auto()
|
||||
A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n
|
||||
A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm)
|
||||
A_ENC_FFN_POST_NORM_1 = auto() # gemma3n
|
||||
A_ENC_FFN_SCALE_1 = auto() # gemma3n
|
||||
A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n
|
||||
A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n
|
||||
A_MMPROJ = auto()
|
||||
A_MMPROJ_FC = auto()
|
||||
A_MM_NORM_PRE = auto()
|
||||
A_MM_NORM_MID = auto()
|
||||
A_MM_EMBEDDING = auto() # gemma3n
|
||||
A_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||
A_MM_SOFT_EMB_NORM = auto() # gemma3n
|
||||
A_MM_INP_PROJ = auto() # gemma3n
|
||||
# nextn/mtp
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
|
|
@ -1071,7 +1094,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
|
||||
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
|
||||
|
|
@ -1100,19 +1132,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm",
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm",
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale",
|
||||
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1",
|
||||
|
|
@ -1120,6 +1159,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
|
||||
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
|
||||
MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
|
||||
# lfm2 audio
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
|
||||
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
|
||||
|
|
@ -1170,6 +1213,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_MM_INP_PROJ,
|
||||
MODEL_TENSOR.V_MM_INP_NORM,
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.V_MM_EMBEDDING,
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM,
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM,
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP,
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ,
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K,
|
||||
|
|
@ -1197,19 +1249,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||
MODEL_TENSOR.A_ENC_CONV1D,
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
||||
MODEL_TENSOR.A_PRE_NORM,
|
||||
MODEL_TENSOR.A_POST_NORM,
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM,
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q,
|
||||
MODEL_TENSOR.A_ENC_ATTN_K,
|
||||
MODEL_TENSOR.A_ENC_ATTN_V,
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.A_ENC_OUTPUT,
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE,
|
||||
MODEL_TENSOR.A_ENC_FFN_UP,
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE,
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1,
|
||||
|
|
@ -1226,6 +1285,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_CONV_NORM,
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1,
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2,
|
||||
MODEL_TENSOR.A_MM_INP_PROJ,
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.A_MM_EMBEDDING,
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
@ -3496,6 +3559,8 @@ class GGUFValueType(IntEnum):
|
|||
|
||||
class VisionProjectorType:
|
||||
GEMMA3 = "gemma3"
|
||||
GEMMA3NV = "gemma3nv"
|
||||
GEMMA3NA = "gemma3na"
|
||||
IDEFICS3 = "idefics3"
|
||||
PIXTRAL = "pixtral"
|
||||
LLAMA4 = "llama4"
|
||||
|
|
|
|||
|
|
@ -1086,6 +1086,9 @@ class GGUFWriter:
|
|||
def add_clip_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_clip_vision_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_vision_projection_dim(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
|
||||
|
||||
|
|
@ -1168,6 +1171,9 @@ class GGUFWriter:
|
|||
|
||||
# audio models
|
||||
|
||||
def add_clip_audio_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_audio_projection_dim(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
|
||||
|
||||
|
|
|
|||
|
|
@ -123,6 +123,40 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.CONV1D: (
|
||||
"backbone.embed", # roberta
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_EMBEDDING: (
|
||||
"model.embed_vision.embedding", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
|
||||
"model.embed_vision.hard_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
"model.embed_vision.embedding_projection", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
||||
"model.embed_vision.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM: (
|
||||
"model.vision_tower.timm_model.conv_stem.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
|
||||
"model.vision_tower.timm_model.conv_stem.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.norm", # gemma3n
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
|
|
@ -1575,6 +1609,11 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_CONV1D: (
|
||||
"audio_tower.conv{bid}", # ultravox
|
||||
"conformer.pre_encode.conv.{bid}", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PRE_NORM: (),
|
||||
|
|
@ -1587,40 +1626,64 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_ATTN_Q: (
|
||||
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
|
||||
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
|
||||
"conformer.layers.{bid}.norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_self_att", # lfm2
|
||||
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: (
|
||||
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE: (
|
||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP: (
|
||||
"audio_tower.layers.{bid}.fc1", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||
|
|
@ -1628,22 +1691,35 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_FFN_DOWN: (
|
||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
|
||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_LINEAR_POS: (
|
||||
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_POS_BIAS_U: (
|
||||
|
|
@ -1656,6 +1732,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_ENC_OUT: (
|
||||
"conformer.pre_encode.out", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
|
||||
),
|
||||
|
||||
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
|
|
@ -1681,22 +1758,40 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_ENC_CONV_DW: (
|
||||
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: (
|
||||
"conformer.layers.{bid}.conv.batch_norm", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: (
|
||||
"conformer.layers.{bid}.norm_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: (
|
||||
"model.embed_audio.embedding", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
|
||||
"model.embed_audio.hard_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_INP_PROJ: (
|
||||
"model.embed_audio.embedding_projection", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
|
||||
"model.embed_audio.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
|
||||
# NextN/MTP tensors for GLM4_MOE
|
||||
|
|
|
|||
|
|
@ -1295,7 +1295,9 @@ extern "C" {
|
|||
// available samplers:
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
|
||||
|
||||
/// seed == LLAMA_DEFAULT_SEED to use a random seed.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
/// Setting k <= 0 makes this a noop
|
||||
|
|
|
|||
|
|
@ -2561,6 +2561,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
|
||||
}
|
||||
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
|
||||
// calculate the split points
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
|
||||
std::vector<float> splits(n_devices());
|
||||
|
|
@ -2571,6 +2576,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
}
|
||||
splits[i] = free;
|
||||
}
|
||||
} else {
|
||||
|
|
@ -2587,7 +2599,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
splits[i] /= split_sum;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
|
|
@ -2596,9 +2607,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
|
||||
#endif
|
||||
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
|
||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||
const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
|
||||
|
|
|
|||
|
|
@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
|
|||
const uint32_t seed;
|
||||
uint32_t seed_cur;
|
||||
|
||||
std::mt19937 rng;
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
|
||||
|
|
|
|||
|
|
@ -135,8 +135,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
}
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
size_t free, total;
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
|||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
|
||||
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||
res->add_input(std::move(inp));
|
||||
} else {
|
||||
GGML_ABORT("TODO: support embd input");
|
||||
// Vision embedding path: use padding token (ID=0) embedding
|
||||
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
|
||||
|
||||
// Extract and dequantize padding token embedding (column 0)
|
||||
ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
||||
ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
|
||||
inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
|
||||
|
||||
// Reshape to [n_embd_altup, n_layer, 1]
|
||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
||||
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
||||
}
|
||||
res->add_input(std::move(inp));
|
||||
return inp_per_layer;
|
||||
}
|
||||
|
||||
|
|
@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
|
|||
-1); // [n_embd_altup, n_layer, n_tokens]
|
||||
cb(per_layer_proj, "per_layer_proj", -1);
|
||||
|
||||
inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
|
||||
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||
cb(inp_per_layer, "inp_per_layer", -1);
|
||||
|
||||
|
|
|
|||
|
|
@ -154,6 +154,47 @@
|
|||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
|
||||
// mobilenetv5 (gemma3n) definitions
|
||||
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
|
||||
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
|
||||
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
|
||||
|
||||
// Stage 0 Block (Edge Residual)
|
||||
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
|
||||
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
|
||||
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
|
||||
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
|
||||
|
||||
// Stage 1+ Block (Universal Inverted Residual)
|
||||
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
|
||||
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
|
||||
|
||||
// Attention Components
|
||||
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
|
||||
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
|
||||
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
|
||||
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
|
||||
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
|
||||
|
||||
// MSFA
|
||||
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
|
|
@ -171,6 +212,8 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_QWEN3VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
|
|
@ -203,6 +246,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
|
||||
|
|
|
|||
|
|
@ -173,6 +173,45 @@ struct clip_layer {
|
|||
}
|
||||
};
|
||||
|
||||
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
|
||||
struct mobilenetv5_block {
|
||||
// Stage 0 (Edge Residual)
|
||||
ggml_tensor * s0_conv_exp_w = nullptr;
|
||||
ggml_tensor * s0_bn1_w = nullptr;
|
||||
ggml_tensor * s0_conv_pwl_w = nullptr;
|
||||
ggml_tensor * s0_bn2_w = nullptr;
|
||||
|
||||
// Stage 1+ (Universal Inverted Residual)
|
||||
ggml_tensor * dw_start_w = nullptr;
|
||||
ggml_tensor * dw_start_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_exp_w = nullptr;
|
||||
ggml_tensor * pw_exp_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * dw_mid_w = nullptr;
|
||||
ggml_tensor * dw_mid_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_proj_w = nullptr;
|
||||
ggml_tensor * pw_proj_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * layer_scale_w = nullptr;
|
||||
|
||||
// Attention (MQA) components
|
||||
ggml_tensor * attn_q_w = nullptr;
|
||||
ggml_tensor * attn_k_w = nullptr;
|
||||
ggml_tensor * attn_v_w = nullptr;
|
||||
ggml_tensor * attn_o_w = nullptr;
|
||||
|
||||
// Optional downsampling/norm in attention
|
||||
ggml_tensor * attn_k_dw_w = nullptr;
|
||||
ggml_tensor * attn_k_norm_w = nullptr;
|
||||
ggml_tensor * attn_v_dw_w = nullptr;
|
||||
ggml_tensor * attn_v_norm_w = nullptr;
|
||||
|
||||
// Block norm (often present in attention blocks)
|
||||
ggml_tensor * attn_norm_w = nullptr;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
|
@ -289,6 +328,23 @@ struct clip_model {
|
|||
ggml_tensor * mm_input_proj_w = nullptr;
|
||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||
|
||||
// mobilenetv5 for gemma3n
|
||||
std::vector<mobilenetv5_block> mobilenet_blocks;
|
||||
std::vector<int> mobilenet_stage_ends;
|
||||
ggml_tensor * mobilenet_stem_conv_w = nullptr;
|
||||
ggml_tensor * mobilenet_stem_conv_b = nullptr;
|
||||
ggml_tensor * mobilenet_stem_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_proj_norm_w = nullptr;
|
||||
|
||||
// Multi-Scale Fusion Adapter (MSFA) components
|
||||
ggml_tensor * msfa_concat_conv_w = nullptr;
|
||||
ggml_tensor * msfa_concat_norm_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
||||
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@
|
|||
#include "models/qwen3vl.cpp"
|
||||
#include "models/siglip.cpp"
|
||||
#include "models/whisper-enc.cpp"
|
||||
#include "models/mobilenetv5.cpp"
|
||||
#include "models/youtuvl.cpp"
|
||||
|
||||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
|
@ -833,6 +834,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_siglip>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
|
|
@ -1215,6 +1220,14 @@ struct clip_model_loader {
|
|||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
// Similar configuration to Gemma3
|
||||
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
|
|
@ -1408,6 +1421,10 @@ struct clip_model_loader {
|
|||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
|
|
@ -1482,6 +1499,7 @@ struct clip_model_loader {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
switch (model.proj_type) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
|
|
@ -1621,6 +1639,99 @@ struct clip_model_loader {
|
|||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
|
||||
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
|
||||
|
||||
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
|
||||
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
|
||||
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
|
||||
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
|
||||
|
||||
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
|
||||
|
||||
// Dynamically load blocks stage by stage
|
||||
for (int stage = 0; stage < 4; ++stage) {
|
||||
int blocks_found_in_stage = 0;
|
||||
|
||||
for (int blk_idx = 0; ; ++blk_idx) {
|
||||
bool found_block = false;
|
||||
mobilenetv5_block block;
|
||||
|
||||
// 1. Check for Edge Residual (S0)
|
||||
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
|
||||
if (block.s0_conv_exp_w) {
|
||||
found_block = true;
|
||||
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
|
||||
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
|
||||
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
|
||||
}
|
||||
// 2. Check for UIR (Universal Inverted Residual)
|
||||
else {
|
||||
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
|
||||
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
|
||||
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
|
||||
|
||||
if (block.dw_start_w || block.pw_exp_w) {
|
||||
found_block = true;
|
||||
if (block.dw_start_w) {
|
||||
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
|
||||
}
|
||||
if (block.pw_exp_w) {
|
||||
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
|
||||
if (block.dw_mid_w) {
|
||||
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
|
||||
if (block.pw_proj_w) {
|
||||
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check for Attention (MQA)
|
||||
// Even if UIR/Edge check failed, this might be a pure attention block
|
||||
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
|
||||
if (attn_q_check) {
|
||||
found_block = true;
|
||||
block.attn_q_w = attn_q_check;
|
||||
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
|
||||
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
|
||||
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
|
||||
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
|
||||
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
|
||||
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
|
||||
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
|
||||
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
|
||||
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
|
||||
if (!block.layer_scale_w) {
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
if (found_block) {
|
||||
model.mobilenet_blocks.push_back(block);
|
||||
blocks_found_in_stage++;
|
||||
} else {
|
||||
// End of blocks for this stage
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Track where this stage ends in the flat vector
|
||||
if (blocks_found_in_stage > 0) {
|
||||
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
|
||||
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
|
||||
}
|
||||
}
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
|
|
@ -2081,6 +2192,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
|
||||
try {
|
||||
clip_model_loader loader(fname);
|
||||
bool skip_audio = false;
|
||||
|
||||
if (loader.has_vision) {
|
||||
ctx_vision = new clip_ctx(ctx_params);
|
||||
|
|
@ -2090,10 +2202,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio) {
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
|
|
@ -3051,6 +3167,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
clip_image_u8 resized_image;
|
||||
int sz = params.image_size;
|
||||
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
|
||||
|
|
@ -3313,6 +3439,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
int scale_factor = ctx->model.hparams.n_merge;
|
||||
n_patches /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
|
||||
// regardless of input size (see architecture description)
|
||||
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
|
|
@ -3705,6 +3837,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
|
|
@ -4029,6 +4162,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
// main path + deepstack paths
|
||||
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.projection->ne[1];
|
||||
|
|
@ -4059,6 +4193,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
||||
return ctx->model.hparams.minicpmv_version;
|
||||
}
|
||||
|
|
@ -4066,14 +4201,20 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
|
|
@ -4097,11 +4238,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
|
|
|
|||
451
tools/mtmd/models/mobilenetv5.cpp
Normal file
451
tools/mtmd/models/mobilenetv5.cpp
Normal file
|
|
@ -0,0 +1,451 @@
|
|||
#include "models.h"
|
||||
|
||||
// Helpers for MobileNetV5 Blocks
|
||||
// RMS Norm 2D - normalizes over channels for each spatial position
|
||||
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
|
||||
// inp: [W, H, C, B]
|
||||
|
||||
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_rms_norm(ctx0, cur, eps);
|
||||
|
||||
if (weight) {
|
||||
cur = ggml_mul(ctx0, cur, weight);
|
||||
}
|
||||
|
||||
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
|
||||
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
|
||||
const int64_t ih = inp->ne[1]; // height
|
||||
const int64_t iw = inp->ne[0]; // width
|
||||
|
||||
// Calculate output size (ceil division)
|
||||
const int64_t oh = (ih + stride_h - 1) / stride_h;
|
||||
const int64_t ow = (iw + stride_w - 1) / stride_w;
|
||||
|
||||
// Calculate padding needed
|
||||
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
|
||||
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
|
||||
|
||||
// Split padding asymmetrically
|
||||
const int pad_h_top = pad_h / 2;
|
||||
const int pad_h_bottom = pad_h - pad_h_top;
|
||||
const int pad_w_left = pad_w / 2;
|
||||
const int pad_w_right = pad_w - pad_w_left;
|
||||
|
||||
// Apply padding if needed
|
||||
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
|
||||
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
|
||||
if (pad_h > 0 || pad_w > 0) {
|
||||
inp = ggml_pad_ext(ctx0, inp,
|
||||
pad_w_left, pad_w_right, // width padding (dim 0)
|
||||
pad_h_top, pad_h_bottom, // height padding (dim 1)
|
||||
0, 0, // no channel padding (dim 2)
|
||||
0, 0); // no batch padding (dim 3)
|
||||
}
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
|
||||
// Edge Residual Block (Stage 0)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// 1. Expansion Conv (3x3)
|
||||
if (stride == 2) {
|
||||
// Case: Downsampling (Block 0)
|
||||
// Replicates Conv2dSame(kernel=3, stride=2)
|
||||
cur = pad_same_2d(cur, 3, 3, stride, stride);
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
|
||||
} else {
|
||||
// Case: Normal 3x3 Block (Block 1, 2)
|
||||
// Replicates Conv2d(kernel=3, stride=1, padding=1)
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
|
||||
}
|
||||
|
||||
// BN + Activation
|
||||
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// 2. Pointwise Linear Conv (1x1)
|
||||
// 1x1 Convs usually have padding=0 and stride=1
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
|
||||
|
||||
// 3. Residual Connection
|
||||
// Only apply residual if spatial dimensions and channels match (stride 1)
|
||||
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Universal Inverted Residual Block (Stage 1+)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// 1. Depthwise Start (Optional)
|
||||
// NOTE: dw_start always has stride=1 (no downsampling here)
|
||||
if (block.dw_start_w) {
|
||||
int k = block.dw_start_w->ne[0]; // 3 or 5
|
||||
int p = k / 2;
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
|
||||
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
|
||||
}
|
||||
|
||||
// 2. Pointwise Expansion (1x1)
|
||||
if (block.pw_exp_w) {
|
||||
// Standard 1x1 conv, pad=0, stride=1
|
||||
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
}
|
||||
|
||||
// 3. Depthwise Mid (Optional)
|
||||
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
|
||||
if (block.dw_mid_w) {
|
||||
int k = block.dw_mid_w->ne[0]; // 3 or 5
|
||||
|
||||
if (stride > 1) {
|
||||
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
|
||||
cur = pad_same_2d(cur, k, k, stride, stride);
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
|
||||
} else {
|
||||
// Case: Stride 1 -> Use Standard Symmetric Padding
|
||||
int p = k / 2;
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
|
||||
}
|
||||
|
||||
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
}
|
||||
|
||||
// 4. Pointwise Projection (1x1)
|
||||
if (block.pw_proj_w) {
|
||||
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
|
||||
}
|
||||
|
||||
// Apply Layer Scaling if present
|
||||
if (block.layer_scale_w) {
|
||||
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
|
||||
}
|
||||
|
||||
// 5. Residual Connection
|
||||
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
|
||||
bool same_channel = (inp->ne[2] == cur->ne[2]);
|
||||
if (same_spatial && same_channel) {
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Attention Block (MQA)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// Norm
|
||||
if (block.attn_norm_w) {
|
||||
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
|
||||
}
|
||||
|
||||
// 1. Q Calculation
|
||||
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// 2. K Calculation (Downsampled)
|
||||
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
|
||||
ggml_tensor * k_inp = cur;
|
||||
if (block.attn_k_dw_w) {
|
||||
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
|
||||
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
|
||||
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
|
||||
if (block.attn_k_norm_w) {
|
||||
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
|
||||
}
|
||||
}
|
||||
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// 3. V Calculation (Downsampled)
|
||||
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
|
||||
ggml_tensor * v_inp = cur;
|
||||
if (block.attn_v_dw_w) {
|
||||
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
|
||||
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
|
||||
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
|
||||
if (block.attn_v_norm_w) {
|
||||
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
|
||||
}
|
||||
}
|
||||
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
|
||||
const int D = k->ne[2]; // Head dimension
|
||||
const int n_head = q->ne[2] / D;
|
||||
const int N = W * H;
|
||||
|
||||
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
|
||||
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
|
||||
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
|
||||
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
|
||||
q = ggml_cont(ctx0, q);
|
||||
|
||||
const int Wk = k->ne[0]; const int Hk = k->ne[1];
|
||||
const int M = Wk * Hk;
|
||||
|
||||
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
|
||||
k = ggml_reshape_3d(ctx0, k, M, D, B);
|
||||
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
|
||||
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
|
||||
k = ggml_cont(ctx0, k);
|
||||
|
||||
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
|
||||
v = ggml_reshape_3d(ctx0, v, M, D, B);
|
||||
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
|
||||
v = ggml_cont(ctx0, v); // [M, D, 1, B]
|
||||
|
||||
// Multi-Query Attention
|
||||
float scale = 1.0f / sqrtf((float)D);
|
||||
|
||||
// Step 1: Compute Q @ K.T
|
||||
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
|
||||
|
||||
scores = ggml_scale(ctx0, scores, scale);
|
||||
|
||||
scores = ggml_soft_max(ctx0, scores);
|
||||
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
|
||||
|
||||
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
|
||||
kqv = ggml_cont(ctx0, kqv);
|
||||
|
||||
|
||||
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
|
||||
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
|
||||
kqv = ggml_cont(ctx0, kqv);
|
||||
|
||||
// Output projection
|
||||
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// Residual & Layer Scale
|
||||
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
|
||||
if (block.layer_scale_w) {
|
||||
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_cgraph * clip_graph_mobilenetv5::build() {
|
||||
ggml_tensor * inp = build_inp_raw();
|
||||
|
||||
// 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
|
||||
ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
|
||||
|
||||
cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
|
||||
if (model.mobilenet_stem_conv_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
|
||||
}
|
||||
if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
|
||||
// 2. Blocks
|
||||
std::vector<ggml_tensor*> intermediate_features;
|
||||
const int total_blocks = model.mobilenet_blocks.size();
|
||||
|
||||
auto is_stage_start = [&](int i) {
|
||||
if (i == 0) return true;
|
||||
for (int end_idx : model.mobilenet_stage_ends) {
|
||||
if (i == end_idx + 1) return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto is_fusion_point = [&](int i) {
|
||||
if (model.mobilenet_stage_ends.size() >= 4) {
|
||||
if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
|
||||
if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
|
||||
} else {
|
||||
if (i == total_blocks - 1) return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
for (int i = 0; i < total_blocks; i++) {
|
||||
const auto & block = model.mobilenet_blocks[i];
|
||||
int stride = is_stage_start(i) ? 2 : 1;
|
||||
|
||||
if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
|
||||
else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
|
||||
else cur = build_inverted_residual(cur, block, stride);
|
||||
|
||||
if (is_fusion_point(i)) {
|
||||
|
||||
intermediate_features.push_back(cur);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Multi-Scale Fusion Adapter (MSFA)
|
||||
if (!intermediate_features.empty()) {
|
||||
|
||||
// A. Reference Resolution: PyTorch implementation uses inputs[0]
|
||||
// We assume intermediate_features[0] is the "High Resolution" target.
|
||||
// In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
|
||||
ggml_tensor* target_feat = intermediate_features[0];
|
||||
int high_res_w = target_feat->ne[0];
|
||||
int high_res_h = target_feat->ne[1];
|
||||
|
||||
std::vector<ggml_tensor*> resized_feats;
|
||||
|
||||
// B. Resize inputs to match inputs[0] (High Resolution)
|
||||
for (auto feat : intermediate_features) {
|
||||
int feat_w = feat->ne[0];
|
||||
int feat_h = feat->ne[1];
|
||||
|
||||
// PyTorch: if feat_size < high_resolution: interpolate
|
||||
if (feat_w < high_res_w || feat_h < high_res_h) {
|
||||
// Calculate scale factor.
|
||||
// Note: PyTorch 'nearest' works on arbitrary float scales.
|
||||
// ggml_upscale generally takes integer factors or target sizes depending on helper.
|
||||
// Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
|
||||
int scale_w = high_res_w / feat_w;
|
||||
// int scale_h = high_res_h / feat_h;
|
||||
|
||||
// Safety check for non-integer scaling if strictly replicating
|
||||
GGML_ASSERT(high_res_w % feat_w == 0);
|
||||
|
||||
// Upsample (Nearest Neighbor)
|
||||
// 2 is the scale factor
|
||||
feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
|
||||
}
|
||||
resized_feats.push_back(feat);
|
||||
}
|
||||
|
||||
// C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
|
||||
cur = resized_feats[0];
|
||||
for (size_t k = 1; k < resized_feats.size(); ++k) {
|
||||
cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
|
||||
}
|
||||
|
||||
// D. FFN (UniversalInvertedResidual)
|
||||
// Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
|
||||
|
||||
// 1. Expansion
|
||||
if (model.msfa_ffn_expand_w) {
|
||||
// 1x1 Conv
|
||||
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
if (model.msfa_ffn_expand_bn) {
|
||||
cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
|
||||
}
|
||||
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
}
|
||||
|
||||
// 2. Projection (No DW because kernel_size=0)
|
||||
if (model.msfa_ffn_project_w) {
|
||||
// 1x1 Conv
|
||||
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// UniversalInvertedResidual typically has a norm after projection
|
||||
if (model.msfa_ffn_project_bn) {
|
||||
cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// E. Final Downsample to Target Resolution (Output Resolution)
|
||||
// PyTorch: matches self.output_resolution (e.g. 16x16)
|
||||
const int target_out_res = 16;
|
||||
int current_w = cur->ne[0];
|
||||
|
||||
if (current_w > target_out_res) {
|
||||
int s = current_w / target_out_res;
|
||||
|
||||
GGML_ASSERT(current_w % target_out_res == 0);
|
||||
|
||||
// Avg Pool: Kernel=s, Stride=s
|
||||
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
// F. Final Norm
|
||||
if (model.msfa_concat_norm_w) {
|
||||
cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Gemma 3n Multimodal Projection (Embedder)
|
||||
// Input: 'cur' is [Width, Height, Channels, Batch]
|
||||
int W = cur->ne[0];
|
||||
int H = cur->ne[1];
|
||||
int C = cur->ne[2];
|
||||
int B = cur->ne[3];
|
||||
|
||||
GGML_ASSERT(C == hparams.n_embd);
|
||||
|
||||
// 1. Permute and Flatten to [Channels, Tokens, Batch]
|
||||
// PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
|
||||
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
|
||||
// 2. FEATURE SCALING
|
||||
// PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
|
||||
const float scale_factor = sqrtf((float)C);
|
||||
cur = ggml_scale(ctx0, cur, scale_factor);
|
||||
|
||||
|
||||
// 3. SOFT EMBEDDING NORM
|
||||
// PyTorch: self._norm(x) * self.weight
|
||||
// We must normalize regardless, then multiply if weight exists.
|
||||
{
|
||||
const float eps = 1e-6f; // Gemma3n uses 1e-6
|
||||
cur = ggml_rms_norm(ctx0, cur, eps);
|
||||
|
||||
if (model.mm_soft_emb_norm_w) {
|
||||
// Weight shape is (2048,) -> Element-wise broadcast multiply
|
||||
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// 4. PROJECTION
|
||||
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
|
||||
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
|
||||
if (model.mm_input_proj_w) {
|
||||
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
|
||||
}
|
||||
|
||||
// 5. POST PROJECTION NORM
|
||||
// PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
|
||||
// with_scale=False means weight is registered as buffer with value 1.0
|
||||
// So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
|
||||
{
|
||||
const float eps = 1e-6f;
|
||||
cur = ggml_rms_norm(ctx0, cur, eps);
|
||||
|
||||
if (model.mm_post_proj_norm_w) {
|
||||
// If weight is loaded, multiply (should be ~1.0 anyway)
|
||||
cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -76,3 +76,36 @@ struct clip_graph_glm4v : clip_graph {
|
|||
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_mobilenetv5 : clip_graph {
|
||||
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
ggml_tensor * rms_norm_2d(
|
||||
ggml_tensor * inp,
|
||||
ggml_tensor * weight,
|
||||
float eps = 1e-6f);
|
||||
|
||||
ggml_tensor* pad_same_2d(
|
||||
ggml_tensor* inp,
|
||||
int kernel_h,
|
||||
int kernel_w,
|
||||
int stride_h,
|
||||
int stride_w,
|
||||
int dilation_h = 1,
|
||||
int dilation_w = 1);
|
||||
|
||||
ggml_tensor * build_edge_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
ggml_tensor * build_inverted_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
ggml_tensor * build_mobilenet_attn(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -266,7 +266,7 @@ struct mtmd_context {
|
|||
}
|
||||
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3) {
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
|
|
@ -862,10 +862,15 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -4,7 +4,6 @@
|
|||
#include "server-task.h"
|
||||
#include "server-queue.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
|
|
@ -16,7 +15,6 @@
|
|||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
#include <filesystem>
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
|
|
@ -2617,10 +2615,6 @@ private:
|
|||
// on successful decode, restore the original batch size
|
||||
n_batch = llama_n_batch(ctx);
|
||||
|
||||
// technically, measuring the time here excludes the sampling time for the last batch
|
||||
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
for (auto & slot : slots) {
|
||||
// may need to copy state to other slots
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
|
||||
|
|
@ -2687,6 +2681,9 @@ private:
|
|||
|
||||
common_sampler_accept(slot.smpl.get(), id, true);
|
||||
|
||||
// here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
slot.n_decoded += 1;
|
||||
|
||||
if (slot.n_decoded == 1) {
|
||||
|
|
@ -2730,6 +2727,8 @@ private:
|
|||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
|
||||
|
|
@ -2927,9 +2926,14 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
|||
if (task.params.n_cmpl > 1) {
|
||||
task.n_children = task.params.n_cmpl - 1;
|
||||
for (size_t j = 0; j < task.n_children; j++) {
|
||||
server_task child = task.create_child(
|
||||
task.id,
|
||||
rd.get_new_id());
|
||||
server_task child = task.create_child(task.id, rd.get_new_id());
|
||||
|
||||
// use different sampling seed for each child
|
||||
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
|
||||
if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
||||
child.params.sampling.seed += j + 1;
|
||||
}
|
||||
|
||||
tasks.push_back(std::move(child));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -503,5 +503,4 @@ def test_chat_completions_multiple_choices():
|
|||
assert len(res.body["choices"]) == 2
|
||||
for choice in res.body["choices"]:
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert match_regex("Suddenly", choice["message"]["content"])
|
||||
assert choice["finish_reason"] == "length"
|
||||
|
|
|
|||
|
|
@ -10,21 +10,11 @@
|
|||
import { INPUT_CLASSES } from '$lib/constants/input-classes';
|
||||
import { SETTING_CONFIG_DEFAULT } from '$lib/constants/settings-config';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
|
||||
import { modelOptions, selectedModelId } from '$lib/stores/models.svelte';
|
||||
import { isRouterMode } from '$lib/stores/server.svelte';
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { activeMessages } from '$lib/stores/conversations.svelte';
|
||||
import {
|
||||
FileTypeCategory,
|
||||
MimeTypeApplication,
|
||||
FileExtensionAudio,
|
||||
FileExtensionImage,
|
||||
FileExtensionPdf,
|
||||
FileExtensionText,
|
||||
MimeTypeAudio,
|
||||
MimeTypeImage,
|
||||
MimeTypeText
|
||||
} from '$lib/enums';
|
||||
import { MimeTypeText } from '$lib/enums';
|
||||
import { isIMEComposing, parseClipboardContent } from '$lib/utils';
|
||||
import {
|
||||
AudioRecorder,
|
||||
|
|
@ -61,7 +51,6 @@
|
|||
let audioRecorder: AudioRecorder | undefined;
|
||||
let chatFormActionsRef: ChatFormActions | undefined = $state(undefined);
|
||||
let currentConfig = $derived(config());
|
||||
let fileAcceptString = $state<string | undefined>(undefined);
|
||||
let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
|
||||
let isRecording = $state(false);
|
||||
let message = $state('');
|
||||
|
|
@ -104,40 +93,6 @@
|
|||
return null;
|
||||
});
|
||||
|
||||
// State for model props reactivity
|
||||
let modelPropsVersion = $state(0);
|
||||
|
||||
// Fetch model props when active model changes (works for both MODEL and ROUTER mode)
|
||||
$effect(() => {
|
||||
if (activeModelId) {
|
||||
const cached = modelsStore.getModelProps(activeModelId);
|
||||
if (!cached) {
|
||||
modelsStore.fetchModelProps(activeModelId).then(() => {
|
||||
modelPropsVersion++;
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Derive modalities from active model (works for both MODEL and ROUTER mode)
|
||||
let hasAudioModality = $derived.by(() => {
|
||||
if (activeModelId) {
|
||||
void modelPropsVersion; // Trigger reactivity on props fetch
|
||||
return modelsStore.modelSupportsAudio(activeModelId);
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
let hasVisionModality = $derived.by(() => {
|
||||
if (activeModelId) {
|
||||
void modelPropsVersion; // Trigger reactivity on props fetch
|
||||
return modelsStore.modelSupportsVision(activeModelId);
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
function checkModelSelected(): boolean {
|
||||
if (!hasModelSelected) {
|
||||
// Open the model selector
|
||||
|
|
@ -148,42 +103,12 @@
|
|||
return true;
|
||||
}
|
||||
|
||||
function getAcceptStringForFileType(fileType: FileTypeCategory): string {
|
||||
switch (fileType) {
|
||||
case FileTypeCategory.IMAGE:
|
||||
return [...Object.values(FileExtensionImage), ...Object.values(MimeTypeImage)].join(',');
|
||||
|
||||
case FileTypeCategory.AUDIO:
|
||||
return [...Object.values(FileExtensionAudio), ...Object.values(MimeTypeAudio)].join(',');
|
||||
|
||||
case FileTypeCategory.PDF:
|
||||
return [...Object.values(FileExtensionPdf), ...Object.values(MimeTypeApplication)].join(
|
||||
','
|
||||
);
|
||||
|
||||
case FileTypeCategory.TEXT:
|
||||
return [...Object.values(FileExtensionText), MimeTypeText.PLAIN].join(',');
|
||||
|
||||
default:
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
function handleFileSelect(files: File[]) {
|
||||
onFileUpload?.(files);
|
||||
}
|
||||
|
||||
function handleFileUpload(fileType?: FileTypeCategory) {
|
||||
if (fileType) {
|
||||
fileAcceptString = getAcceptStringForFileType(fileType);
|
||||
} else {
|
||||
fileAcceptString = undefined;
|
||||
}
|
||||
|
||||
// Use setTimeout to ensure the accept attribute is applied before opening dialog
|
||||
setTimeout(() => {
|
||||
fileInputRef?.click();
|
||||
}, 10);
|
||||
function handleFileUpload() {
|
||||
fileInputRef?.click();
|
||||
}
|
||||
|
||||
async function handleKeydown(event: KeyboardEvent) {
|
||||
|
|
@ -343,13 +268,7 @@
|
|||
});
|
||||
</script>
|
||||
|
||||
<ChatFormFileInputInvisible
|
||||
bind:this={fileInputRef}
|
||||
bind:accept={fileAcceptString}
|
||||
{hasAudioModality}
|
||||
{hasVisionModality}
|
||||
onFileSelect={handleFileSelect}
|
||||
/>
|
||||
<ChatFormFileInputInvisible bind:this={fileInputRef} onFileSelect={handleFileSelect} />
|
||||
|
||||
<form
|
||||
onsubmit={handleSubmit}
|
||||
|
|
|
|||
|
|
@ -4,14 +4,13 @@
|
|||
import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
import { FILE_TYPE_ICONS } from '$lib/constants/icons';
|
||||
import { FileTypeCategory } from '$lib/enums';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
disabled?: boolean;
|
||||
hasAudioModality?: boolean;
|
||||
hasVisionModality?: boolean;
|
||||
onFileUpload?: (fileType?: FileTypeCategory) => void;
|
||||
onFileUpload?: () => void;
|
||||
}
|
||||
|
||||
let {
|
||||
|
|
@ -27,10 +26,6 @@
|
|||
? 'Text files and PDFs supported. Images, audio, and video require vision models.'
|
||||
: 'Attach files';
|
||||
});
|
||||
|
||||
function handleFileUpload(fileType?: FileTypeCategory) {
|
||||
onFileUpload?.(fileType);
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="flex items-center gap-1 {className}">
|
||||
|
|
@ -61,7 +56,7 @@
|
|||
<DropdownMenu.Item
|
||||
class="images-button flex cursor-pointer items-center gap-2"
|
||||
disabled={!hasVisionModality}
|
||||
onclick={() => handleFileUpload(FileTypeCategory.IMAGE)}
|
||||
onclick={() => onFileUpload?.()}
|
||||
>
|
||||
<FILE_TYPE_ICONS.image class="h-4 w-4" />
|
||||
|
||||
|
|
@ -81,7 +76,7 @@
|
|||
<DropdownMenu.Item
|
||||
class="audio-button flex cursor-pointer items-center gap-2"
|
||||
disabled={!hasAudioModality}
|
||||
onclick={() => handleFileUpload(FileTypeCategory.AUDIO)}
|
||||
onclick={() => onFileUpload?.()}
|
||||
>
|
||||
<FILE_TYPE_ICONS.audio class="h-4 w-4" />
|
||||
|
||||
|
|
@ -98,7 +93,7 @@
|
|||
|
||||
<DropdownMenu.Item
|
||||
class="flex cursor-pointer items-center gap-2"
|
||||
onclick={() => handleFileUpload(FileTypeCategory.TEXT)}
|
||||
onclick={() => onFileUpload?.()}
|
||||
>
|
||||
<FILE_TYPE_ICONS.text class="h-4 w-4" />
|
||||
|
||||
|
|
@ -109,7 +104,7 @@
|
|||
<Tooltip.Trigger class="w-full">
|
||||
<DropdownMenu.Item
|
||||
class="flex cursor-pointer items-center gap-2"
|
||||
onclick={() => handleFileUpload(FileTypeCategory.PDF)}
|
||||
onclick={() => onFileUpload?.()}
|
||||
>
|
||||
<FILE_TYPE_ICONS.pdf class="h-4 w-4" />
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
isRecording?: boolean;
|
||||
hasText?: boolean;
|
||||
uploadedFiles?: ChatUploadedFile[];
|
||||
onFileUpload?: (fileType?: FileTypeCategory) => void;
|
||||
onFileUpload?: () => void;
|
||||
onMicClick?: () => void;
|
||||
onStop?: () => void;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,35 +1,14 @@
|
|||
<script lang="ts">
|
||||
import { generateModalityAwareAcceptString } from '$lib/utils';
|
||||
|
||||
interface Props {
|
||||
accept?: string;
|
||||
class?: string;
|
||||
hasAudioModality?: boolean;
|
||||
hasVisionModality?: boolean;
|
||||
multiple?: boolean;
|
||||
onFileSelect?: (files: File[]) => void;
|
||||
}
|
||||
|
||||
let {
|
||||
accept = $bindable(),
|
||||
class: className = '',
|
||||
hasAudioModality = false,
|
||||
hasVisionModality = false,
|
||||
multiple = true,
|
||||
onFileSelect
|
||||
}: Props = $props();
|
||||
let { class: className = '', multiple = true, onFileSelect }: Props = $props();
|
||||
|
||||
let fileInputElement: HTMLInputElement | undefined;
|
||||
|
||||
// Use modality-aware accept string by default, but allow override
|
||||
let finalAccept = $derived(
|
||||
accept ??
|
||||
generateModalityAwareAcceptString({
|
||||
hasVision: hasVisionModality,
|
||||
hasAudio: hasAudioModality
|
||||
})
|
||||
);
|
||||
|
||||
export function click() {
|
||||
fileInputElement?.click();
|
||||
}
|
||||
|
|
@ -46,7 +25,6 @@
|
|||
bind:this={fileInputElement}
|
||||
type="file"
|
||||
{multiple}
|
||||
accept={finalAccept}
|
||||
onchange={handleFileSelect}
|
||||
class="hidden {className}"
|
||||
/>
|
||||
|
|
|
|||
|
|
@ -195,9 +195,28 @@ export function getFileTypeByExtension(filename: string): string | null {
|
|||
}
|
||||
|
||||
export function isFileTypeSupported(filename: string, mimeType?: string): boolean {
|
||||
if (mimeType && getFileTypeCategory(mimeType)) {
|
||||
// Images are detected and handled separately for vision models
|
||||
if (mimeType) {
|
||||
const category = getFileTypeCategory(mimeType);
|
||||
if (
|
||||
category === FileTypeCategory.IMAGE ||
|
||||
category === FileTypeCategory.AUDIO ||
|
||||
category === FileTypeCategory.PDF
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check extension for known types (especially images without MIME)
|
||||
const extCategory = getFileTypeCategoryByExtension(filename);
|
||||
if (
|
||||
extCategory === FileTypeCategory.IMAGE ||
|
||||
extCategory === FileTypeCategory.AUDIO ||
|
||||
extCategory === FileTypeCategory.PDF
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return getFileTypeByExtension(filename) !== null;
|
||||
// Fallback: treat everything else as text (inclusive by default)
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,7 +76,6 @@ export {
|
|||
isFileTypeSupportedByModel,
|
||||
filterFilesByModalities,
|
||||
generateModalityErrorMessage,
|
||||
generateModalityAwareAcceptString,
|
||||
type ModalityCapabilities
|
||||
} from './modality-file-validation';
|
||||
|
||||
|
|
|
|||
|
|
@ -4,17 +4,7 @@
|
|||
*/
|
||||
|
||||
import { getFileTypeCategory } from '$lib/utils';
|
||||
import {
|
||||
FileExtensionAudio,
|
||||
FileExtensionImage,
|
||||
FileExtensionPdf,
|
||||
FileExtensionText,
|
||||
MimeTypeAudio,
|
||||
MimeTypeImage,
|
||||
MimeTypeApplication,
|
||||
MimeTypeText,
|
||||
FileTypeCategory
|
||||
} from '$lib/enums';
|
||||
import { FileTypeCategory } from '$lib/enums';
|
||||
|
||||
/** Modality capabilities for file validation */
|
||||
export interface ModalityCapabilities {
|
||||
|
|
@ -170,29 +160,3 @@ export function generateModalityErrorMessage(
|
|||
* @param capabilities - The modality capabilities to check against
|
||||
* @returns Accept string for HTML file input element
|
||||
*/
|
||||
export function generateModalityAwareAcceptString(capabilities: ModalityCapabilities): string {
|
||||
const { hasVision, hasAudio } = capabilities;
|
||||
|
||||
const acceptedExtensions: string[] = [];
|
||||
const acceptedMimeTypes: string[] = [];
|
||||
|
||||
// Always include text files and PDFs
|
||||
acceptedExtensions.push(...Object.values(FileExtensionText));
|
||||
acceptedMimeTypes.push(...Object.values(MimeTypeText));
|
||||
acceptedExtensions.push(...Object.values(FileExtensionPdf));
|
||||
acceptedMimeTypes.push(...Object.values(MimeTypeApplication));
|
||||
|
||||
// Include images only if vision is supported
|
||||
if (hasVision) {
|
||||
acceptedExtensions.push(...Object.values(FileExtensionImage));
|
||||
acceptedMimeTypes.push(...Object.values(MimeTypeImage));
|
||||
}
|
||||
|
||||
// Include audio only if audio is supported
|
||||
if (hasAudio) {
|
||||
acceptedExtensions.push(...Object.values(FileExtensionAudio));
|
||||
acceptedMimeTypes.push(...Object.values(MimeTypeAudio));
|
||||
}
|
||||
|
||||
return [...acceptedExtensions, ...acceptedMimeTypes].join(',');
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
import { isSvgMimeType, svgBase64UrlToPngDataURL } from './svg-to-png';
|
||||
import { isTextFileByName } from './text-files';
|
||||
import { isWebpMimeType, webpBase64UrlToPngDataURL } from './webp-to-png';
|
||||
import { FileTypeCategory } from '$lib/enums';
|
||||
import { modelsStore } from '$lib/stores/models.svelte';
|
||||
|
|
@ -84,17 +83,6 @@ export async function processFilesToChatUploaded(
|
|||
}
|
||||
|
||||
results.push({ ...base, preview });
|
||||
} else if (
|
||||
getFileTypeCategory(file.type) === FileTypeCategory.TEXT ||
|
||||
isTextFileByName(file.name)
|
||||
) {
|
||||
try {
|
||||
const textContent = await readFileAsUTF8(file);
|
||||
results.push({ ...base, textContent });
|
||||
} catch (err) {
|
||||
console.warn('Failed to read text file, adding without content:', err);
|
||||
results.push(base);
|
||||
}
|
||||
} else if (getFileTypeCategory(file.type) === FileTypeCategory.PDF) {
|
||||
// Extract text content from PDF for preview
|
||||
try {
|
||||
|
|
@ -129,8 +117,14 @@ export async function processFilesToChatUploaded(
|
|||
const preview = await readFileAsDataURL(file);
|
||||
results.push({ ...base, preview });
|
||||
} else {
|
||||
// Other files: add as-is
|
||||
results.push(base);
|
||||
// Fallback: treat unknown files as text
|
||||
try {
|
||||
const textContent = await readFileAsUTF8(file);
|
||||
results.push({ ...base, textContent });
|
||||
} catch (err) {
|
||||
console.warn('Failed to read file as text, adding without content:', err);
|
||||
results.push(base);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error processing file', file.name, error);
|
||||
|
|
|
|||
|
|
@ -65,10 +65,7 @@
|
|||
await expect(textarea).toHaveValue(text);
|
||||
|
||||
const fileInput = document.querySelector('input[type="file"]');
|
||||
const acceptAttr = fileInput?.getAttribute('accept');
|
||||
await expect(fileInput).toHaveAttribute('accept');
|
||||
await expect(acceptAttr).not.toContain('image/');
|
||||
await expect(acceptAttr).not.toContain('audio/');
|
||||
await expect(fileInput).not.toHaveAttribute('accept');
|
||||
|
||||
// Open file attachments dropdown
|
||||
const fileUploadButton = canvas.getByText('Attach files');
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue