mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/vulkan.Dockerfile # .github/workflows/build.yml # ci/run.sh # examples/model-conversion/Makefile # examples/model-conversion/README.md # examples/model-conversion/scripts/causal/compare-logits.py # examples/model-conversion/scripts/embedding/run-converted-model.sh # examples/model-conversion/scripts/utils/common.py # examples/model-conversion/scripts/utils/semantic_check.py # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cuda/CMakeLists.txt # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/pr2wt.sh # scripts/sync_vendor.py # tests/test-arg-parser.cpp
This commit is contained in:
commit
983baac46b
28 changed files with 2516 additions and 757 deletions
|
|
@ -2090,11 +2090,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
add_opt(common_arg(
|
||||
{"--mmap"},
|
||||
{"--no-mmap"},
|
||||
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
||||
string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.use_mmap = value;
|
||||
if (value) {
|
||||
params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_MMAP"));
|
||||
add_opt(common_arg(
|
||||
{"-dio", "--direct-io"},
|
||||
{"-ndio", "--no-direct-io"},
|
||||
string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.use_direct_io = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DIO"));
|
||||
add_opt(common_arg(
|
||||
{"--numa"}, "TYPE",
|
||||
"attempt optimizations that help on some NUMA systems\n"
|
||||
|
|
@ -2246,7 +2257,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
std::vector<std::string> split_arg{ it, {} };
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
throw std::invalid_argument(
|
||||
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
||||
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
||||
);
|
||||
}
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
|
|
@ -2286,10 +2297,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_env("LLAMA_ARG_FIT"));
|
||||
add_opt(common_arg(
|
||||
{ "-fitt", "--fit-target" }, "MiB",
|
||||
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
|
||||
[](common_params & params, int value) {
|
||||
params.fit_params_target = value * size_t(1024*1024);
|
||||
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
|
||||
string_format("target margin per device for --fit, comma-separated list of values, "
|
||||
"single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::string arg_next = value;
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{ R"([,/]+)" };
|
||||
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
||||
std::vector<std::string> split_arg{ it, {} };
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
throw std::invalid_argument(
|
||||
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
||||
);
|
||||
}
|
||||
if (split_arg.size() == 1) {
|
||||
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
|
||||
return;
|
||||
}
|
||||
for (size_t i = 0; i < split_arg.size(); i++) {
|
||||
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_FIT_TARGET"));
|
||||
add_opt(common_arg(
|
||||
|
|
|
|||
|
|
@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
|
|||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
||||
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
||||
};
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
|
|
|||
|
|
@ -1105,7 +1105,7 @@ common_init_result::common_init_result(common_params & params) :
|
|||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
}
|
||||
|
||||
|
|
@ -1374,6 +1374,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_direct_io = params.use_direct_io;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||
|
|
|
|||
|
|
@ -328,12 +328,14 @@ struct common_params {
|
|||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
// margin per device in bytes for fitting parameters to free memory:
|
||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
|
|
@ -424,7 +426,8 @@ struct common_params {
|
|||
bool kv_unified = false; // enable unified KV cache
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mmap = true; // enable mmap to use filesystem cache
|
||||
bool use_direct_io = true; // read from disk without buffering for faster model loading
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
|
|
|
|||
|
|
@ -308,7 +308,8 @@ static bool common_download_head(CURL * curl,
|
|||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token) {
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
|
|
@ -330,6 +331,11 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
common_load_model_from_url_headers headers;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
curl_slist_ptr http_headers;
|
||||
|
||||
for (const auto & h : custom_headers) {
|
||||
std::string s = h.first + ": " + h.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
|
||||
}
|
||||
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
|
|
@ -454,8 +460,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
|||
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
||||
}
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
|
||||
std::string header_ = header.first + ": " + header.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
|
|
@ -619,7 +627,8 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token) {
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
|
|
@ -629,6 +638,9 @@ static bool common_download_file_single_online(const std::string & url,
|
|||
if (!bearer_token.empty()) {
|
||||
default_headers.insert({"Authorization", "Bearer " + bearer_token});
|
||||
}
|
||||
for (const auto & h : custom_headers) {
|
||||
default_headers.emplace(h.first, h.second);
|
||||
}
|
||||
cli.set_default_headers(default_headers);
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
|
|
@ -734,13 +746,9 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
|||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
size_t pos = header.find(':');
|
||||
if (pos != std::string::npos) {
|
||||
headers.emplace(header.substr(0, pos), header.substr(pos + 1));
|
||||
} else {
|
||||
headers.emplace(header, "");
|
||||
}
|
||||
headers.emplace(header.first, header.second);
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
|
|
@ -772,9 +780,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
|||
static bool common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
if (!offline) {
|
||||
return common_download_file_single_online(url, path, bearer_token);
|
||||
return common_download_file_single_online(url, path, bearer_token, headers);
|
||||
}
|
||||
|
||||
if (!std::filesystem::exists(path)) {
|
||||
|
|
@ -788,13 +797,24 @@ static bool common_download_file_single(const std::string & url,
|
|||
|
||||
// download multiple files from remote URLs to local paths
|
||||
// the input is a vector of pairs <url, path>
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
futures_download.reserve(urls.size());
|
||||
|
||||
for (auto const & item : urls) {
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline);
|
||||
}, item));
|
||||
futures_download.push_back(
|
||||
std::async(
|
||||
std::launch::async,
|
||||
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token, offline, headers);
|
||||
},
|
||||
item
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
|
|
@ -807,17 +827,17 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
|
|||
return true;
|
||||
}
|
||||
|
||||
bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
bool common_download_model(const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & headers) {
|
||||
// Basic validation of the model.url
|
||||
if (model.url.empty()) {
|
||||
LOG_ERR("%s: invalid model url\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token, offline, headers)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -876,13 +896,16 @@ bool common_download_model(
|
|||
}
|
||||
|
||||
// Download in parallel
|
||||
common_download_file_multiple(urls, bearer_token, offline);
|
||||
common_download_file_multiple(urls, bearer_token, offline, headers);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
|
||||
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
const common_header_list & custom_headers) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
|
|
@ -893,10 +916,10 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
|
|||
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
|
||||
// headers
|
||||
std::vector<std::string> headers;
|
||||
headers.push_back("Accept: application/json");
|
||||
common_header_list headers = custom_headers;
|
||||
headers.push_back({"Accept", "application/json"});
|
||||
if (!bearer_token.empty()) {
|
||||
headers.push_back("Authorization: Bearer " + bearer_token);
|
||||
headers.push_back({"Authorization", "Bearer " + bearer_token});
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
||||
|
|
@ -1031,9 +1054,10 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
|||
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
|
||||
std::string manifest_url = url_prefix + "/manifests/" + tag;
|
||||
common_remote_params manifest_params;
|
||||
manifest_params.headers.push_back("Authorization: Bearer " + token);
|
||||
manifest_params.headers.push_back(
|
||||
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
|
||||
manifest_params.headers.push_back({"Authorization", "Bearer " + token});
|
||||
manifest_params.headers.push_back({"Accept",
|
||||
"application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
|
||||
});
|
||||
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
|
||||
if (manifest_res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
|
||||
|
|
@ -1070,7 +1094,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
|||
std::string local_path = fs_get_cache_file(model_filename);
|
||||
|
||||
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
|
||||
if (!common_download_file_single(blob_url, local_path, token, false)) {
|
||||
if (!common_download_file_single(blob_url, local_path, token, false, {})) {
|
||||
throw std::runtime_error("Failed to download Docker Model");
|
||||
}
|
||||
|
||||
|
|
@ -1084,11 +1108,11 @@ std::string common_docker_resolve_model(const std::string & docker) {
|
|||
|
||||
#else
|
||||
|
||||
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
|
||||
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
|
||||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
bool common_download_model(const common_params_model &, const std::string &, bool) {
|
||||
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
|
||||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,21 @@
|
|||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct common_params_model;
|
||||
|
||||
//
|
||||
// download functionalities
|
||||
//
|
||||
using common_header = std::pair<std::string, std::string>;
|
||||
using common_header_list = std::vector<common_header>;
|
||||
|
||||
struct common_remote_params {
|
||||
common_header_list headers;
|
||||
long timeout = 0; // in seconds, 0 means no timeout
|
||||
long max_size = 0; // unlimited if 0
|
||||
};
|
||||
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
||||
struct common_cached_model_info {
|
||||
std::string manifest_path;
|
||||
|
|
@ -41,13 +50,17 @@ struct common_hf_file_res {
|
|||
common_hf_file_res common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & bearer_token,
|
||||
bool offline);
|
||||
bool offline,
|
||||
const common_header_list & headers = {}
|
||||
);
|
||||
|
||||
// returns true if download succeeded
|
||||
bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline);
|
||||
bool offline,
|
||||
const common_header_list & headers = {}
|
||||
);
|
||||
|
||||
// returns list of cached models
|
||||
std::vector<common_cached_model_info> common_list_cached_models();
|
||||
|
|
|
|||
|
|
@ -771,8 +771,8 @@ class TextModel(ModelBase):
|
|||
|
||||
self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
|
||||
|
||||
rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
|
||||
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
|
||||
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
|
||||
|
|
|
|||
|
|
@ -553,6 +553,7 @@ int main(int argc, char ** argv) {
|
|||
model_params.n_gpu_layers = params.n_gpu_layers;
|
||||
model_params.devices = params.devices.data();
|
||||
model_params.use_mmap = params.use_mmap;
|
||||
model_params.use_direct_io = params.use_direct_io;
|
||||
model_params.use_mlock = params.use_mlock;
|
||||
model_params.check_tensors = params.check_tensors;
|
||||
|
||||
|
|
|
|||
|
|
@ -4134,6 +4134,7 @@ struct ggml_backend_cuda_device_context {
|
|||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
|
|
@ -4688,11 +4689,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
|
|||
}
|
||||
|
||||
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
|
||||
return get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
|
||||
|
|
@ -4861,6 +4860,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
|
|
@ -4874,6 +4874,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
|
|
|
|||
|
|
@ -219,6 +219,8 @@ struct ggml_metal_device_props {
|
|||
bool use_shared_buffers;
|
||||
|
||||
bool supports_gpu_family_apple7;
|
||||
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(void);
|
||||
|
|
|
|||
|
|
@ -788,6 +788,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|||
|
||||
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
|
||||
dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
|
||||
dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
|
||||
|
|
|
|||
|
|
@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
|
|||
}
|
||||
|
||||
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
|
||||
|
||||
return (op->op == GGML_OP_MUL_MAT ||
|
||||
op->op == GGML_OP_MUL_MAT_ID) &&
|
||||
get_op_batch_size(op) >= min_batch_size;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(op);
|
||||
get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
|
||||
}
|
||||
|
||||
static ggml_backend_device_i ggml_backend_metal_device_i = {
|
||||
|
|
|
|||
|
|
@ -9148,6 +9148,7 @@ typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t;
|
|||
template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
||||
|
|
|
|||
|
|
@ -586,6 +586,7 @@ struct vk_device_struct {
|
|||
bool uma;
|
||||
bool prefer_host_memory;
|
||||
bool float_controls_rte_fp16;
|
||||
bool subgroup_basic;
|
||||
bool subgroup_arithmetic;
|
||||
bool subgroup_shuffle;
|
||||
bool subgroup_ballot;
|
||||
|
|
@ -1520,6 +1521,11 @@ template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
|
|||
init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L);
|
||||
}
|
||||
|
||||
struct vk_quantize_q8_1_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
|
|
@ -3356,12 +3362,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
|
||||
GGML_ASSERT(device->subgroup_ballot);
|
||||
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||
if (device->coopmat_bf16_support) {
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
@ -3469,9 +3475,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
#endif
|
||||
|
||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
|
|
@ -3513,9 +3519,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
}
|
||||
#endif
|
||||
} else {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
|
@ -3630,9 +3636,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
#endif
|
||||
|
||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||
|
||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
|
|
@ -3656,9 +3662,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||
} else {
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||
|
|
@ -3856,22 +3862,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
|
||||
const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||
}
|
||||
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
|
||||
}
|
||||
|
|
@ -3959,9 +3965,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
|
||||
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||
|
|
@ -4169,9 +4175,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
|
|
@ -4317,8 +4323,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
|
||||
if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
|
||||
|
|
@ -4664,6 +4670,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
}
|
||||
device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
|
||||
|
||||
device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic);
|
||||
device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
|
||||
#ifdef __APPLE__
|
||||
|
|
@ -6135,6 +6143,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
|||
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
||||
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
||||
GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
|
||||
GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));
|
||||
|
||||
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
||||
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
||||
|
|
@ -6917,7 +6926,12 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
|
|||
const uint64_t max_elements = std::min<uint64_t>(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits<uint32_t>::max());
|
||||
const uint32_t elements = std::min(ne, static_cast<uint32_t>(max_elements));
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 2>{ ne, num_blocks }, { elements, 1, 1 });
|
||||
const vk_quantize_q8_1_push_constants pc = {
|
||||
ne,
|
||||
num_blocks,
|
||||
};
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 });
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
|
|
@ -9908,8 +9922,9 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|||
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
const int splitH = 16;
|
||||
const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, splitH);
|
||||
const uint32_t d_state = src0->ne[0];
|
||||
uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
|
||||
const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
|
||||
const uint32_t num_workgroups_y = n_seq;
|
||||
elements = { num_workgroups_x, num_workgroups_y, 1 };
|
||||
|
||||
|
|
@ -14287,6 +14302,7 @@ struct ggml_backend_vk_device_context {
|
|||
std::string description;
|
||||
bool is_integrated_gpu;
|
||||
std::string pci_bus_id;
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
|
|
@ -14814,11 +14830,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
return false;
|
||||
}
|
||||
|
||||
const uint32_t SPLIT_H = 16;
|
||||
size_t shmem_size = d_state * sizeof(float);
|
||||
|
||||
size_t stateC_size = SPLIT_H * d_state * sizeof(float);
|
||||
if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (stateC_size > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
if (!device->subgroup_basic) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -14858,12 +14876,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
|
|||
}
|
||||
|
||||
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
UNUSED(dev);
|
||||
return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
|
||||
|
|
@ -14989,6 +15005,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
|||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
|
|
@ -14998,6 +15015,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
|||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
ctx->op_offload_min_batch_size = min_batch_size;
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#if USE_SUBGROUP_ADD
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#endif
|
||||
|
|
@ -9,7 +10,8 @@
|
|||
|
||||
layout(constant_id = 0) const uint D_STATE = 128;
|
||||
layout(constant_id = 1) const uint SUBGROUP_SIZE = 32;
|
||||
layout(constant_id = 2) const uint SPLIT_H = 16;
|
||||
|
||||
const uint32_t c_factor = D_STATE / SUBGROUP_SIZE;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
|
|
@ -41,22 +43,28 @@ float softplus(float x) {
|
|||
}
|
||||
}
|
||||
|
||||
shared float stateC[SPLIT_H * D_STATE];
|
||||
#if !USE_SUBGROUP_ADD
|
||||
shared float temp[D_STATE];
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint head_idx = (gl_WorkGroupID.x * SPLIT_H) / d_head;
|
||||
const uint head_off = ((gl_WorkGroupID.x * SPLIT_H) % d_head) * 4;
|
||||
const uint seq_idx = gl_WorkGroupID.y;
|
||||
const uint subgroup = gl_SubgroupID;
|
||||
const uint lane = gl_SubgroupInvocationID;
|
||||
const uint tid = gl_SubgroupID * SUBGROUP_SIZE + lane;
|
||||
const uint subgroup_idx = gl_WorkGroupID.x * c_factor + subgroup;
|
||||
|
||||
const uint head_idx = subgroup_idx / d_head;
|
||||
const uint head_off = (subgroup_idx % d_head) * 4;
|
||||
const uint seq_idx = gl_WorkGroupID.y;
|
||||
|
||||
const uint group_off = (head_idx / (n_head / n_group)) * D_STATE * 4;
|
||||
const uint s0_base_idx = (uint(ids[seq_idx]) * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;
|
||||
const uint x_base_idx = (seq_idx * nb13 + gl_WorkGroupID.x * SPLIT_H * 4) / 4;
|
||||
const uint x_base_idx = (seq_idx * nb13 + subgroup_idx * 4) / 4;
|
||||
const uint dt_base_idx = (seq_idx * nb22 + head_idx * 4) / 4;
|
||||
const uint A_base_idx = (head_idx * nb31) / 4;
|
||||
const uint B_base_idx = (seq_idx * nb43 + group_off) / 4;
|
||||
const uint C_base_idx = (seq_idx * nb53 + group_off) / 4;
|
||||
const uint y_base_idx = seq_idx * n_tok * n_head * d_head + gl_WorkGroupID.x * SPLIT_H;
|
||||
const uint y_base_idx = seq_idx * n_tok * n_head * d_head + subgroup_idx;
|
||||
const uint s_base_idx = (s_off + seq_idx * nb03 + head_idx * nb02 + head_off * D_STATE) / 4;
|
||||
|
||||
const uint stride_x = nb12 / 4;
|
||||
|
|
@ -65,76 +73,52 @@ void main() {
|
|||
const uint stride_C = nb52 / 4;
|
||||
const uint stride_y = n_head * d_head;
|
||||
|
||||
float state[SPLIT_H];
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
state[j] = s0[s0_base_idx + j * D_STATE + tid];
|
||||
float state[c_factor];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < c_factor; j++) {
|
||||
state[j] = s0[s0_base_idx + SUBGROUP_SIZE * j + lane];
|
||||
}
|
||||
|
||||
float a = A[A_base_idx];
|
||||
|
||||
for (uint i = 0; i < n_tok; i++) {
|
||||
const float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);
|
||||
float dt_soft_plus = softplus(dt[dt_base_idx + i * stride_dt]);
|
||||
|
||||
const float dA = exp(dt_soft_plus * A[A_base_idx]);
|
||||
|
||||
const float B_val = B[B_base_idx + i * stride_B + tid];
|
||||
const float C_val = C[C_base_idx + i * stride_C + tid];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
const float x_dt = x[x_base_idx + i * stride_x + j] * dt_soft_plus;
|
||||
float state_sum = 0.0f;
|
||||
|
||||
const float dA = exp(dt_soft_plus * a);
|
||||
const float x_dt = x[x_base_idx + i * stride_x] * dt_soft_plus;
|
||||
[[unroll]] for (uint j = 0; j < c_factor; j++) {
|
||||
float B_val = B[B_base_idx + i * stride_B + SUBGROUP_SIZE * j + lane];
|
||||
float C_val = C[C_base_idx + i * stride_C + SUBGROUP_SIZE * j + lane];
|
||||
state[j] = (state[j] * dA) + (B_val * x_dt);
|
||||
|
||||
stateC[j * D_STATE + tid] = state[j] * C_val;
|
||||
state_sum += state[j] * C_val;
|
||||
}
|
||||
|
||||
#if USE_SUBGROUP_ADD
|
||||
state_sum = subgroupAdd(state_sum);
|
||||
#else
|
||||
temp[tid] = state_sum;
|
||||
barrier();
|
||||
[[unroll]]
|
||||
for (uint w = D_STATE / 2; w >= SUBGROUP_SIZE; w >>= 1) {
|
||||
[[unroll]] for (uint j = 0; j < (w * SPLIT_H + D_STATE - 1) / D_STATE; j++) {
|
||||
const uint k = (tid % w) + (D_STATE * (tid / w)) + j * D_STATE * (D_STATE / w);
|
||||
if (k < SPLIT_H * D_STATE && (k + w) < SPLIT_H * D_STATE) {
|
||||
stateC[k] += stateC[k + w];
|
||||
}
|
||||
[[unroll]] for (uint s = SUBGROUP_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (lane < s) {
|
||||
temp[tid] += temp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
[[unroll]] for (uint j = 0; j < max(1, SPLIT_H / (D_STATE / SUBGROUP_SIZE)); j++) {
|
||||
const uint idx = (tid % SUBGROUP_SIZE) +
|
||||
D_STATE * (tid / SUBGROUP_SIZE) +
|
||||
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
|
||||
const uint max_idx = SUBGROUP_SIZE - 1 +
|
||||
D_STATE * ((D_STATE - 1) / SUBGROUP_SIZE) +
|
||||
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
|
||||
|
||||
if (idx < SPLIT_H * D_STATE ||
|
||||
max_idx < SPLIT_H * D_STATE) {
|
||||
float sc;
|
||||
#if USE_SUBGROUP_ADD
|
||||
sc = stateC[idx];
|
||||
sc = subgroupAdd(sc);
|
||||
#else
|
||||
[[unroll]] for (uint offset = SUBGROUP_SIZE / 2; offset > 0; offset >>= 1) {
|
||||
if (idx + offset < SPLIT_H * D_STATE) {
|
||||
stateC[idx] += stateC[idx + offset];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid % SUBGROUP_SIZE == 0) {
|
||||
sc = stateC[idx];
|
||||
}
|
||||
// get the value from lane 0
|
||||
state_sum = temp[subgroup * SUBGROUP_SIZE];
|
||||
barrier();
|
||||
#endif
|
||||
|
||||
if (tid % SUBGROUP_SIZE == 0) {
|
||||
const uint k = tid / SUBGROUP_SIZE + j * (D_STATE / SUBGROUP_SIZE);
|
||||
d[y_base_idx + i * stride_y + k] = sc;
|
||||
}
|
||||
}
|
||||
if (lane == 0) {
|
||||
d[y_base_idx + i * stride_y] = state_sum;
|
||||
}
|
||||
|
||||
barrier();
|
||||
}
|
||||
|
||||
[[unroll]] for (uint j = 0; j < SPLIT_H; j++) {
|
||||
d[s_base_idx + j * D_STATE + tid] = state[j];
|
||||
// write back the state
|
||||
[[unroll]]
|
||||
for (int j = 0; j < c_factor; j++) {
|
||||
d[s_base_idx + SUBGROUP_SIZE * j + lane] = state[j];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -641,6 +641,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
|
|||
|
||||
draft_model_params.use_mmap = base_model_params.use_mmap;
|
||||
draft_model_params.use_mlock = base_model_params.use_mlock;
|
||||
draft_model_params.use_direct_io = base_model_params.use_direct_io;
|
||||
draft_model_params.n_gpu_layers = draft_gpulayers; //layers offload the speculative model.
|
||||
draft_ctx_params.n_ctx = base_ctx_params.n_ctx;
|
||||
draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
|
||||
|
|
@ -2371,6 +2372,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
llama_ctx_params.kv_unified = true;
|
||||
model_params.use_mmap = inputs.use_mmap;
|
||||
model_params.use_mlock = inputs.use_mlock;
|
||||
model_params.use_direct_io = false; //no direct io for now until stable
|
||||
model_params.n_gpu_layers = inputs.gpulayers;
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
|
|
@ -2549,6 +2551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
//apply overrides from autofit
|
||||
float tensor_split_temp[128] = {0}; //temp buffer for autofit
|
||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(),1024*1024*1024);
|
||||
if(inputs.autofit)
|
||||
{
|
||||
common_params temp_params;
|
||||
|
|
@ -2561,8 +2564,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
model_params.tensor_split = tensor_split_temp;
|
||||
model_params.n_gpu_layers = -1; //must be this value to be considered default
|
||||
printf("Autofit Reserve Space: %d MB\n",taxmb);
|
||||
fit_params_target[0] = taxmb*1024*1024;
|
||||
llama_params_fit(kcpp_data->model_filename.c_str(), &model_params, &llama_ctx_params,
|
||||
tensor_split_temp, tenos.data(), taxmb*1024*1024, kcpp_data->n_ctx,
|
||||
tensor_split_temp, tenos.data(), fit_params_target.data(), kcpp_data->n_ctx,
|
||||
GGML_LOG_LEVEL_DEBUG);
|
||||
printf("Autofit Result: ");
|
||||
print_fitted_params(model_params,llama_ctx_params);
|
||||
|
|
|
|||
|
|
@ -312,6 +312,7 @@ extern "C" {
|
|||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_direct_io; // use direct io, takes precedence over use_mmap
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||
|
|
@ -497,7 +498,7 @@ extern "C" {
|
|||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t margin, // margin of memory to leave per device in bytes
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
|
|
|
|||
|
|
@ -110,7 +110,7 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
void read_raw(void * ptr, size_t len) {
|
||||
size_t bytes_read = 0;
|
||||
while (bytes_read < len) {
|
||||
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
||||
|
|
@ -127,7 +127,7 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() const {
|
||||
uint32_t read_u32() {
|
||||
uint32_t val;
|
||||
read_raw(&val, sizeof(val));
|
||||
return val;
|
||||
|
|
@ -154,8 +154,8 @@ struct llama_file::impl {
|
|||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
|
||||
throw std::runtime_error("DirectIO is not implemented on Windows.");
|
||||
bool has_direct_io() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
~impl() {
|
||||
|
|
@ -164,33 +164,45 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
#else
|
||||
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
|
||||
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
|
||||
#ifdef __linux__
|
||||
// Try unbuffered I/O for read only
|
||||
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
||||
fd = open(fname, O_RDONLY | O_DIRECT);
|
||||
|
||||
if (fd != -1) {
|
||||
struct stat file_stats{};
|
||||
fstat(fd, &file_stats);
|
||||
|
||||
size = file_stats.st_size;
|
||||
alignment = file_stats.st_blksize;
|
||||
|
||||
off_t ret = lseek(fd, 0, SEEK_SET);
|
||||
if (ret == -1) {
|
||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||
}
|
||||
if (init_fd()) {
|
||||
return;
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
|
||||
fname, strerror(errno));
|
||||
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
|
||||
fname, strerror(errno));
|
||||
}
|
||||
#endif
|
||||
fp = ggml_fopen(fname, mode);
|
||||
init_fp(mode);
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
bool init_fd() {
|
||||
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
|
||||
|
||||
if (fd != -1) {
|
||||
struct stat file_stats{};
|
||||
fstat(fd, &file_stats);
|
||||
|
||||
size = file_stats.st_size;
|
||||
alignment = file_stats.st_blksize;
|
||||
|
||||
off_t ret = lseek(fd, 0, SEEK_SET);
|
||||
if (ret == -1) {
|
||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
void init_fp(const char * mode) {
|
||||
fp = ggml_fopen(fname.c_str(), mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
|
|
@ -226,7 +238,7 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
void read_raw_unsafe(void * ptr, size_t len) {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -249,6 +261,17 @@ struct llama_file::impl {
|
|||
if (errno == EINTR) {
|
||||
continue; // Interrupted by signal, retry
|
||||
}
|
||||
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
||||
if (errno == EFAULT) {
|
||||
auto curr_off = tell();
|
||||
close(fd);
|
||||
fd = -1;
|
||||
alignment = 1;
|
||||
init_fp("rb");
|
||||
seek(curr_off, SEEK_SET);
|
||||
read_raw_unsafe(ptr, len);
|
||||
return;
|
||||
}
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret == 0) {
|
||||
|
|
@ -266,7 +289,8 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
|
||||
void read_aligned_chunk(void * dest, size_t size) {
|
||||
size_t offset = tell();
|
||||
off_t aligned_offset = offset & ~(alignment - 1);
|
||||
off_t offset_from_alignment = offset - aligned_offset;
|
||||
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
||||
|
|
@ -283,13 +307,21 @@ struct llama_file::impl {
|
|||
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
||||
|
||||
seek(aligned_offset, SEEK_SET);
|
||||
read_raw(buffer.get(), bytes_to_read);
|
||||
read_raw_unsafe(buffer.get(), bytes_to_read);
|
||||
|
||||
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
||||
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
||||
}
|
||||
|
||||
uint32_t read_u32() const {
|
||||
void read_raw(void * ptr, size_t len) {
|
||||
if (has_direct_io()) {
|
||||
read_aligned_chunk(ptr, len);
|
||||
} else {
|
||||
read_raw_unsafe(ptr, len);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() {
|
||||
uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
|
|
@ -310,6 +342,10 @@ struct llama_file::impl {
|
|||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
bool has_direct_io() const {
|
||||
return fd != -1 && alignment > 1;
|
||||
}
|
||||
|
||||
~impl() {
|
||||
if (fd != -1) {
|
||||
close(fd);
|
||||
|
|
@ -318,17 +354,9 @@ struct llama_file::impl {
|
|||
}
|
||||
}
|
||||
int fd = -1;
|
||||
std::string fname;
|
||||
#endif
|
||||
|
||||
void read_raw_at(void * ptr, size_t len, size_t offset) const {
|
||||
if (alignment != 1) {
|
||||
read_aligned_chunk(offset, ptr, len);
|
||||
} else {
|
||||
seek(offset, SEEK_SET);
|
||||
read_raw(ptr, len);
|
||||
}
|
||||
}
|
||||
|
||||
size_t read_alignment() const {
|
||||
return alignment;
|
||||
}
|
||||
|
|
@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
|
|||
size_t llama_file::size() const { return pimpl->size; }
|
||||
|
||||
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
||||
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
||||
|
||||
int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
|
|
@ -361,10 +390,14 @@ int llama_file::file_id() const {
|
|||
}
|
||||
|
||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
|
||||
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
|
||||
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#ifdef _WIN32
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
||||
#else
|
||||
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
||||
#endif
|
||||
|
||||
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
|
||||
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
||||
|
||||
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
||||
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
||||
|
|
|
|||
|
|
@ -24,15 +24,16 @@ struct llama_file {
|
|||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
void read_raw_at(void * ptr, size_t len, size_t offset) const;
|
||||
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
|
||||
uint32_t read_u32() const;
|
||||
void read_raw(void * ptr, size_t len);
|
||||
void read_raw_unsafe(void * ptr, size_t len);
|
||||
void read_aligned_chunk(void * dest, size_t size);
|
||||
uint32_t read_u32();
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
void write_u32(uint32_t val) const;
|
||||
|
||||
size_t read_alignment() const;
|
||||
bool has_direct_io() const;
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
|
|
|
|||
|
|
@ -499,6 +499,7 @@ llama_model_loader::llama_model_loader(
|
|||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
|
|
@ -531,9 +532,17 @@ llama_model_loader::llama_model_loader(
|
|||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
||||
|
||||
// Disable mmap in case Direct I/O is enabled and available
|
||||
if (use_direct_io && use_mmap) {
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
|
|
@ -599,7 +608,7 @@ llama_model_loader::llama_model_loader(
|
|||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
|
|
@ -744,6 +753,7 @@ llama_model_loader::llama_model_loader(
|
|||
}
|
||||
|
||||
this->use_mmap = use_mmap;
|
||||
this->use_direct_io = use_direct_io;
|
||||
this->check_tensors = check_tensors;
|
||||
this->no_alloc = no_alloc;
|
||||
}
|
||||
|
|
@ -1123,7 +1133,8 @@ bool llama_model_loader::load_all_data(
|
|||
const auto & file = files.at(weight->idx);
|
||||
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
file->read_raw_at(cur->data, n_size, weight->offs);
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(cur->data, n_size);
|
||||
if (check_tensors) {
|
||||
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
||||
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
||||
|
|
@ -1155,7 +1166,7 @@ bool llama_model_loader::load_all_data(
|
|||
ggml_backend_event_synchronize(events[buffer_idx]);
|
||||
|
||||
// Read aligned chunk from file
|
||||
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
||||
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
||||
|
||||
// Calculate actual data portion (excluding alignment padding)
|
||||
uintptr_t ptr_data = ptr_dest_aligned;
|
||||
|
|
@ -1185,7 +1196,8 @@ bool llama_model_loader::load_all_data(
|
|||
}
|
||||
} else {
|
||||
read_buf.resize(n_size);
|
||||
file->read_raw_at(read_buf.data(), n_size, weight->offs);
|
||||
file->seek(weight->offs, SEEK_SET);
|
||||
file->read_raw(read_buf.data(), n_size);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
||||
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ struct llama_model_loader {
|
|||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
bool use_direct_io = false;
|
||||
bool check_tensors;
|
||||
bool no_alloc;
|
||||
|
||||
|
|
@ -97,6 +98,7 @@ struct llama_model_loader {
|
|||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
|
|
|
|||
|
|
@ -2549,7 +2549,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
bool use_mmap_buffer = true;
|
||||
|
||||
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
|
||||
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
|
||||
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
|
||||
|
||||
// build a list of buffer types for the CPU and GPU devices
|
||||
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
|
||||
|
|
@ -8139,6 +8140,7 @@ llama_model_params llama_model_default_params() {
|
|||
/*.kv_overrides =*/ nullptr,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
/*.use_direct_io =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.check_tensors =*/ false,
|
||||
/*.use_extra_bufts =*/ true,
|
||||
|
|
|
|||
|
|
@ -599,7 +599,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
|
|||
|
|
@ -171,9 +171,8 @@ class llama_params_fit_exception : public std::runtime_error {
|
|||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
|
|
@ -192,6 +191,12 @@ static void llama_params_fit_impl(
|
|||
return;
|
||||
}
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
|
|
@ -211,9 +216,10 @@ static void llama_params_fit_impl(
|
|||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t min_projected_free = INT64_MAX;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
|
|
@ -223,45 +229,63 @@ static void llama_params_fit_impl(
|
|||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
min_projected_free = std::min(min_projected_free, projected_free);
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
|
||||
projected_free >= 0 ? "surplus" : "deficit");
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (min_projected_free >= margin) {
|
||||
if (nd == 1) {
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
LLAMA_LOG_INFO(nd == 1 ?
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, margin/MiB, -global_surplus/MiB);
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free - nd*margin_s;
|
||||
int64_t sum_used_target = sum_free;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
|
|
@ -472,9 +496,9 @@ static void llama_params_fit_impl(
|
|||
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
|
||||
global_surplus_cpu_moe += dmd.free;
|
||||
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
|
|
@ -493,7 +517,7 @@ static void llama_params_fit_impl(
|
|||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margin);
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
|
|
@ -725,11 +749,11 @@ static void llama_params_fit_impl(
|
|||
enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const llama_params_fit_exception & e) {
|
||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
|
|
@ -822,7 +846,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
|
|||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
|
||||
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "common.h"
|
||||
#include "download.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
#include "chat.h"
|
||||
#include "arg.h" // for common_remote_get_content; TODO: use download.h only
|
||||
#include "base64.hpp"
|
||||
|
||||
#include "server-common.h"
|
||||
|
|
@ -779,7 +779,7 @@ static void handle_media(
|
|||
// download remote image
|
||||
// TODO @ngxson : maybe make these params configurable
|
||||
common_remote_params params;
|
||||
params.headers.push_back("User-Agent: llama.cpp/" + build_info);
|
||||
params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
|
||||
params.max_size = 1024 * 1024 * 10; // 10MB
|
||||
params.timeout = 10; // seconds
|
||||
SRV_INF("downloading image from '%s'\n", url.c_str());
|
||||
|
|
|
|||
1486
vendor/cpp-httplib/httplib.cpp
vendored
1486
vendor/cpp-httplib/httplib.cpp
vendored
File diff suppressed because it is too large
Load diff
1124
vendor/cpp-httplib/httplib.h
vendored
1124
vendor/cpp-httplib/httplib.h
vendored
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue