mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-07 17:22:04 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # docs/build-s390x.md # docs/ops.md # docs/ops/zDNN.csv # ggml/include/ggml-zdnn.h # ggml/src/ggml-sycl/binbcast.cpp # ggml/src/ggml-sycl/concat.cpp # ggml/src/ggml-sycl/conv.cpp # ggml/src/ggml-sycl/convert.cpp # ggml/src/ggml-sycl/cpy.cpp # ggml/src/ggml-sycl/dmmv.cpp # ggml/src/ggml-sycl/dpct/helper.hpp # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/getrows.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/gla.cpp # ggml/src/ggml-sycl/im2col.cpp # ggml/src/ggml-sycl/mmq.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-sycl/norm.cpp # ggml/src/ggml-sycl/rope.cpp # ggml/src/ggml-sycl/set_rows.cpp # ggml/src/ggml-sycl/softmax.cpp # ggml/src/ggml-sycl/tsembd.cpp # ggml/src/ggml-sycl/wkv.cpp # ggml/src/ggml-zdnn/ggml-zdnn-impl.h # ggml/src/ggml-zdnn/ggml-zdnn.cpp # tools/llama-bench/llama-bench.cpp
This commit is contained in:
commit
1dbd2fc259
14 changed files with 406 additions and 68 deletions
131
common/arg.cpp
131
common/arg.cpp
|
|
@ -747,6 +747,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
|||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// Docker registry functions
|
||||
//
|
||||
|
||||
static std::string common_docker_get_token(const std::string & repo) {
|
||||
std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
|
||||
|
||||
common_remote_params params;
|
||||
auto res = common_remote_get_content(url, params);
|
||||
|
||||
if (res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
|
||||
}
|
||||
|
||||
std::string response_str(res.second.begin(), res.second.end());
|
||||
nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
|
||||
|
||||
if (!response.contains("token")) {
|
||||
throw std::runtime_error("Docker registry token response missing 'token' field");
|
||||
}
|
||||
|
||||
return response["token"].get<std::string>();
|
||||
}
|
||||
|
||||
static std::string common_docker_resolve_model(const std::string & docker) {
|
||||
// Parse ai/smollm2:135M-Q4_K_M
|
||||
size_t colon_pos = docker.find(':');
|
||||
std::string repo, tag;
|
||||
if (colon_pos != std::string::npos) {
|
||||
repo = docker.substr(0, colon_pos);
|
||||
tag = docker.substr(colon_pos + 1);
|
||||
} else {
|
||||
repo = docker;
|
||||
tag = "latest";
|
||||
}
|
||||
|
||||
// ai/ is the default
|
||||
size_t slash_pos = docker.find('/');
|
||||
if (slash_pos == std::string::npos) {
|
||||
repo.insert(0, "ai/");
|
||||
}
|
||||
|
||||
LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
|
||||
try {
|
||||
// --- helper: digest validation ---
|
||||
auto validate_oci_digest = [](const std::string & digest) -> std::string {
|
||||
// Expected: algo:hex ; start with sha256 (64 hex chars)
|
||||
// You can extend this map if supporting other algorithms in future.
|
||||
static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
|
||||
std::smatch m;
|
||||
if (!std::regex_match(digest, m, re)) {
|
||||
throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
|
||||
}
|
||||
// normalize hex to lowercase
|
||||
std::string normalized = digest;
|
||||
std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
|
||||
return std::tolower(c);
|
||||
});
|
||||
return normalized;
|
||||
};
|
||||
|
||||
std::string token = common_docker_get_token(repo); // Get authentication token
|
||||
|
||||
// Get manifest
|
||||
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
|
||||
std::string manifest_url = url_prefix + "/manifests/" + tag;
|
||||
common_remote_params manifest_params;
|
||||
manifest_params.headers.push_back("Authorization: Bearer " + token);
|
||||
manifest_params.headers.push_back(
|
||||
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
|
||||
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
|
||||
if (manifest_res.first != 200) {
|
||||
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
|
||||
}
|
||||
|
||||
std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
|
||||
nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
|
||||
std::string gguf_digest; // Find the GGUF layer
|
||||
if (manifest.contains("layers")) {
|
||||
for (const auto & layer : manifest["layers"]) {
|
||||
if (layer.contains("mediaType")) {
|
||||
std::string media_type = layer["mediaType"].get<std::string>();
|
||||
if (media_type == "application/vnd.docker.ai.gguf.v3" ||
|
||||
media_type.find("gguf") != std::string::npos) {
|
||||
gguf_digest = layer["digest"].get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (gguf_digest.empty()) {
|
||||
throw std::runtime_error("No GGUF layer found in Docker manifest");
|
||||
}
|
||||
|
||||
// Validate & normalize digest
|
||||
gguf_digest = validate_oci_digest(gguf_digest);
|
||||
LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
|
||||
|
||||
// Prepare local filename
|
||||
std::string model_filename = repo;
|
||||
std::replace(model_filename.begin(), model_filename.end(), '/', '_');
|
||||
model_filename += "_" + tag + ".gguf";
|
||||
std::string local_path = fs_get_cache_file(model_filename);
|
||||
|
||||
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
|
||||
if (!common_download_file_single(blob_url, local_path, token, false)) {
|
||||
throw std::runtime_error("Failed to download Docker Model");
|
||||
}
|
||||
|
||||
LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
|
||||
return local_path;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
|
|
@ -797,7 +915,9 @@ static handle_model_result common_params_handle_model(
|
|||
handle_model_result result;
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
|
||||
model.path = common_docker_resolve_model(model.docker_repo);
|
||||
} else if (!model.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (model.hf_file.empty()) {
|
||||
if (model.path.empty()) {
|
||||
|
|
@ -2638,6 +2758,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.model.url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
|
||||
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
|
||||
"example: gemma3\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.docker_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_DOCKER_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
|
||||
|
|
|
|||
|
|
@ -189,10 +189,11 @@ struct common_params_sampling {
|
|||
};
|
||||
|
||||
struct common_params_model {
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
|
|
@ -448,7 +449,7 @@ struct common_params {
|
|||
|
||||
std::string slot_save_path;
|
||||
|
||||
float slot_prompt_similarity = 0.5f;
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
|
|
|
|||
|
|
@ -132,6 +132,8 @@ extern "C" {
|
|||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
// GPU device using dedicated memory
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
// integrated GPU device using host memory
|
||||
GGML_BACKEND_DEVICE_TYPE_IGPU,
|
||||
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
||||
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
||||
};
|
||||
|
|
@ -150,11 +152,21 @@ extern "C" {
|
|||
|
||||
// all the device properties
|
||||
struct ggml_backend_dev_props {
|
||||
// device name
|
||||
const char * name;
|
||||
// device description
|
||||
const char * description;
|
||||
// device free memory in bytes
|
||||
size_t memory_free;
|
||||
// device total memory in bytes
|
||||
size_t memory_total;
|
||||
// device type
|
||||
enum ggml_backend_dev_type type;
|
||||
// device id
|
||||
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
|
||||
// if the id is unknown, this should be NULL
|
||||
const char * device_id;
|
||||
// device capabilities
|
||||
struct ggml_backend_dev_caps caps;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_BACKEND_API_VERSION 1
|
||||
#define GGML_BACKEND_API_VERSION 2
|
||||
|
||||
//
|
||||
// Backend buffer type
|
||||
|
|
|
|||
|
|
@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
|
|||
|
||||
ggml_backend_t ggml_backend_init_best(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||
if (!dev) {
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
|
||||
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!dev) {
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -560,7 +560,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
|
|||
}
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
|
||||
#if defined(GGML_USE_HIP) && defined(GCN)
|
||||
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
|
||||
asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
|
|
@ -572,7 +572,21 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
|
|||
acc += tmpv.x * tmpu.x;
|
||||
acc += tmpv.y * tmpu.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(GCN)
|
||||
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
|
||||
}
|
||||
|
||||
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
|
||||
template <int nbytes>
|
||||
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
|
||||
if constexpr (nbytes == 4) {
|
||||
*(int *) dst = *(const int *) src;
|
||||
} else if constexpr (nbytes == 8) {
|
||||
*(int2 *) dst = *(const int2 *) src;
|
||||
} else if constexpr (nbytes == 16) {
|
||||
*(int4 *) dst = *(const int4 *) src;
|
||||
} else {
|
||||
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
|
|
|
|||
|
|
@ -8,11 +8,14 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
|
|||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
case 128:
|
||||
case 256:
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
} else {
|
||||
return 64;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
|
|
@ -41,17 +44,26 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
|
|||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
GGML_UNUSED(warp_size);
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
case 128:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
case 256:
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -88,9 +100,17 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
|
|||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 2*warp_size : 128;
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
#else
|
||||
return 64;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 2*warp_size;
|
||||
#if defined(GCN) || defined(CDNA)
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
#else
|
||||
return ncols <= 16 ? 64 : 256;
|
||||
#endif // defined(GCN) || defined(CDNA)
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -196,14 +216,21 @@ static __global__ void flash_attn_tile(
|
|||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
constexpr int cpy_nb = 16;
|
||||
#else
|
||||
constexpr int cpy_nb = 8;
|
||||
#endif // defined(GGML_USE_HIP) && defined(GCN)
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
__shared__ float KQ[ncols][kq_stride];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
__shared__ half2 Q_tmp[ncols][D/2];
|
||||
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + 1)]; // Padded to avoid memory bank conflicts.
|
||||
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#else
|
||||
__shared__ float Q_tmp[ncols][D];
|
||||
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + 1)]; // Padded to avoid memory bank conflicts.
|
||||
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
|
||||
float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
|
||||
float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
|
@ -256,11 +283,11 @@ static __global__ void flash_attn_tile(
|
|||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
|
||||
const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1 + threadIdx.x] = tmp_h2;
|
||||
KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2;
|
||||
#else
|
||||
const float2 tmp_f2 = __half22float2(tmp_h2);
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
|
@ -269,14 +296,14 @@ static __global__ void flash_attn_tile(
|
|||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; ++k_KQ_1) {
|
||||
half2 K_k[kq_stride/warp_size];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) {
|
||||
half2 K_k[kq_stride/warp_size][cpy_ne];
|
||||
half2 Q_k[ncols/nwarps][cpy_ne];
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; ++k_KQ_1) {
|
||||
float K_k[kq_stride/warp_size];
|
||||
float Q_k[ncols/nwarps];
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) {
|
||||
float K_k[kq_stride/warp_size][cpy_ne];
|
||||
float Q_k[ncols/nwarps][cpy_ne];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
|
|
@ -284,9 +311,9 @@ static __global__ void flash_attn_tile(
|
|||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1];
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
|
||||
#else
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_f [i_KQ*(kq_nbatch + 1) + k_KQ_1];
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
|
|
@ -294,9 +321,9 @@ static __global__ void flash_attn_tile(
|
|||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1];
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
|
||||
#else
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0 + k_KQ_1];
|
||||
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
|
|
@ -304,7 +331,10 @@ static __global__ void flash_attn_tile(
|
|||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size], Q_k[j_KQ_0/nwarps]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < cpy_ne; ++k) {
|
||||
ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -345,14 +375,54 @@ static __global__ void flash_attn_tile(
|
|||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) {
|
||||
const int i = i0 + 4*threadIdx.x;
|
||||
|
||||
const float diff = KQ[j][i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j][i] = val;
|
||||
float4 val = *(const float4 *) &KQ[j][i];
|
||||
val.x = expf(val.x - kqmax[j0/nwarps]);
|
||||
val.y = expf(val.y - kqmax[j0/nwarps]);
|
||||
val.z = expf(val.z - kqmax[j0/nwarps]);
|
||||
val.w = expf(val.w - kqmax[j0/nwarps]);
|
||||
kqsum_add += val.x + val.y + val.z + val.w;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)};
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
|
||||
#else
|
||||
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
} else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) {
|
||||
const int i = i0 + 2*threadIdx.x;
|
||||
|
||||
float2 val = *(const float2 *) &KQ[j][i];
|
||||
val.x = expf(val.x - kqmax[j0/nwarps]);
|
||||
val.y = expf(val.y - kqmax[j0/nwarps]);
|
||||
kqsum_add += val.x + val.y;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 tmp = make_half2(val.x, val.y);
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
|
||||
#else
|
||||
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
} else {
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j][i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
((half *) KQ[j])[i] = val;
|
||||
#else
|
||||
KQ[j][i] = val;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
|
|
@ -419,8 +489,7 @@ static __global__ void flash_attn_tile(
|
|||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const float tmp = KQ[j][k0 + k1];
|
||||
KQ_k[j0/nwarps] = make_half2(tmp, tmp);
|
||||
KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]);
|
||||
#else
|
||||
KQ_k[j0/nwarps] = KQ[j][k0 + k1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
|
|
|||
|
|
@ -3223,6 +3223,7 @@ struct ggml_backend_cuda_device_context {
|
|||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
|
|
@ -3247,9 +3248,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
|||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
|
|
@ -3818,6 +3822,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
|
|
|
|||
8
ggml/src/ggml-cuda/vendors/hip.h
vendored
8
ggml/src/ggml-cuda/vendors/hip.h
vendored
|
|
@ -162,6 +162,14 @@
|
|||
#define GCN
|
||||
#endif
|
||||
|
||||
#if defined(__gfx900__) || defined(__gfx906__)
|
||||
#define GCN5
|
||||
#endif
|
||||
|
||||
#if defined(__gfx803__)
|
||||
#define GCN4
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA // For the entire family
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1976,7 +1976,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
|||
}
|
||||
}
|
||||
|
||||
if (buf->device_memory == VK_NULL_HANDLE) {
|
||||
if (!buf->device_memory) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
|
@ -4527,7 +4527,7 @@ static void ggml_vk_instance_init() {
|
|||
new_driver.pNext = &new_id;
|
||||
devices[i].getProperties2(&new_props);
|
||||
|
||||
if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
|
||||
if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
|
||||
// Check if there are two physical devices corresponding to the same GPU
|
||||
auto old_device = std::find_if(
|
||||
vk_instance.device_indices.begin(),
|
||||
|
|
@ -4597,7 +4597,7 @@ static void ggml_vk_instance_init() {
|
|||
}
|
||||
}
|
||||
|
||||
// If no dedicated GPUs found, fall back to the first non-CPU device.
|
||||
// If no GPUs found, fall back to the first non-CPU device.
|
||||
// If only CPU devices are available, return without devices.
|
||||
if (vk_instance.device_indices.empty()) {
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
|
|
@ -12108,12 +12108,63 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
|
|||
}
|
||||
}
|
||||
|
||||
static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
|
||||
GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
device.getProperties2(&props);
|
||||
|
||||
return props.properties.deviceType;
|
||||
}
|
||||
|
||||
static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
|
||||
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool ext_support = false;
|
||||
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) {
|
||||
ext_support = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ext_support) {
|
||||
return "";
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {};
|
||||
|
||||
props.pNext = &pci_bus_info;
|
||||
|
||||
device.getProperties2(&props);
|
||||
|
||||
const uint32_t pci_domain = pci_bus_info.pciDomain;
|
||||
const uint32_t pci_bus = pci_bus_info.pciBus;
|
||||
const uint32_t pci_device = pci_bus_info.pciDevice;
|
||||
const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
|
||||
return std::string(pci_bus_id);
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_vk_device_context {
|
||||
size_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
bool is_integrated_gpu;
|
||||
std::string pci_bus_id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
|
|
@ -12142,14 +12193,18 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
|
|||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
|
|
@ -12416,8 +12471,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
}
|
||||
|
||||
if (
|
||||
src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32 ||
|
||||
src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32
|
||||
(src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) ||
|
||||
(src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
|
@ -12582,6 +12637,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
|||
ctx->device = i;
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ llama_context::llama_context(
|
|||
// graph outputs buffer
|
||||
{
|
||||
// resized during inference when a batch uses more outputs
|
||||
if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
|
||||
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
|
||||
throw std::runtime_error("failed to reserve initial output buffer");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -923,7 +923,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
|
|
@ -1040,8 +1040,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
close_ofstream();
|
||||
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
|
||||
if (qs.n_fallback > 0) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ bool llama_supports_gpu_offload(void) {
|
|||
return true;
|
||||
#else
|
||||
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
|
||||
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
|
||||
llama_supports_rpc();
|
||||
#endif
|
||||
}
|
||||
|
|
@ -215,8 +216,13 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
model->devices.push_back(*dev);
|
||||
}
|
||||
} else {
|
||||
// default device selection
|
||||
|
||||
// build list of available devices
|
||||
std::vector<ggml_backend_dev_t> gpus;
|
||||
std::vector<ggml_backend_dev_t> igpus;
|
||||
std::vector<ggml_backend_dev_t> rpc_servers;
|
||||
// use all available devices
|
||||
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
switch (ggml_backend_dev_type(dev)) {
|
||||
|
|
@ -225,19 +231,51 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
// skip CPU backends since they are handled separately
|
||||
break;
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU: {
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
||||
rpc_servers.push_back(dev);
|
||||
} else {
|
||||
model->devices.push_back(dev);
|
||||
// check if there is already a GPU with the same device id
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
|
||||
ggml_backend_dev_props d_props;
|
||||
ggml_backend_dev_get_props(d, &d_props);
|
||||
if (props.device_id && d_props.device_id) {
|
||||
return strcmp(props.device_id, d_props.device_id) == 0;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (it != gpus.end()) {
|
||||
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
|
||||
__func__,
|
||||
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
||||
props.device_id ? props.device_id : "unknown id",
|
||||
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
|
||||
} else {
|
||||
gpus.push_back(dev);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
||||
igpus.push_back(dev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// add RPC servers at the front of the list
|
||||
if (!rpc_servers.empty()) {
|
||||
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
||||
|
||||
// add RPC servers at the front of the list to minimize network transfers
|
||||
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
||||
|
||||
// add GPUs
|
||||
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
|
||||
|
||||
// add integrated GPUs only if no other devices were found
|
||||
if (model->devices.empty()) {
|
||||
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -258,9 +296,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
}
|
||||
|
||||
for (auto * dev : model->devices) {
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
||||
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
||||
props.device_id ? props.device_id : "unknown id",
|
||||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(path_model, splits, *model, params);
|
||||
|
|
|
|||
|
|
@ -2372,7 +2372,7 @@ struct server_context {
|
|||
}
|
||||
|
||||
if (ret != nullptr) {
|
||||
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
|
||||
SLT_INF(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %.3f (> %.3f thold)\n", lcs_len, similarity, slot_prompt_similarity);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2394,7 +2394,7 @@ struct server_context {
|
|||
}
|
||||
|
||||
if (ret != nullptr) {
|
||||
SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
|
||||
SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue