diff --git a/common/arg.cpp b/common/arg.cpp
index 7a2f8e712..133bcbebf 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -747,6 +747,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 #endif // LLAMA_USE_CURL
 
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
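For reference, the helper above performs the standard anonymous Docker Hub pull-token exchange; a sketch of the traffic it generates for a hypothetical repo "ai/gemma3" (the response fields beyond "token" vary and are not relied upon):

    GET https://auth.docker.io/token?service=registry.docker.io&scope=repository:ai/gemma3:pull
    -> {"token": "<bearer-token>", ...}

Every later request against registry-1.docker.io then carries "Authorization: Bearer <bearer-token>", as done in the manifest fetch below.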
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag  = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag  = "latest";
+    }
+
+    // ai/ is the default repo prefix
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected format: algo:hex; only sha256 (64 hex chars) is accepted for now.
+            // Support for other algorithms can be added here in the future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin() + 7, normalized.end(), normalized.begin() + 7, [](unsigned char c) {
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo); // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+
+        // Find the GGUF layer
+        std::string gguf_digest;
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
@@ -797,7 +915,9 @@ static handle_model_result common_params_handle_model(
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
@@ -2638,6 +2758,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
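Worked examples of the resolution implemented above (inputs taken from the help text and the source comment; the cache filename follows from the '/' to '_' replacement in common_docker_resolve_model):

    "gemma3"                 -> repo "ai/gemma3",  tag "latest"      -> cache file "ai_gemma3_latest.gguf"
    "ai/smollm2:135M-Q4_K_M" -> repo "ai/smollm2", tag "135M-Q4_K_M" -> cache file "ai_smollm2_135M-Q4_K_M.gguf"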
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
diff --git a/common/common.h b/common/common.h
index 0ba1c7624..a44130db3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -189,10 +189,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path    = ""; // model local path // NOLINT
-    std::string url     = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
+    std::string path        = ""; // model local path // NOLINT
+    std::string url         = ""; // model url to download // NOLINT
+    std::string hf_repo     = ""; // HF repo // NOLINT
+    std::string hf_file     = ""; // HF file // NOLINT
+    std::string docker_repo = ""; // Docker repo // NOLINT
 };
 
 struct common_params_speculative {
@@ -448,7 +449,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 4f246f6cc..ab297e0c6 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -132,6 +132,8 @@ extern "C" {
         GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
 
     // all the device properties
     struct ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct ggml_backend_dev_caps caps;
     };
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 2db5c4e0f..89d80db6e 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -8,7 +8,7 @@
 extern "C" {
 #endif
 
-    #define GGML_BACKEND_API_VERSION 1
+    #define GGML_BACKEND_API_VERSION 2
 
     //
     // Backend buffer type
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 3038d8a94..0627ee6d7 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
 
 ggml_backend_t ggml_backend_init_best(void) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (!dev) {
-        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!dev) {
         return nullptr;
     }
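A caller's view of the new fallback order (illustrative sketch, not part of the patch; a standalone program using only public ggml-backend.h APIs):

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // picks a discrete GPU first, then an integrated GPU, then the CPU
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend == nullptr) {
            return 1;
        }
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        // device_id is the field added in this change; it may be NULL
        std::printf("using %s (%s)\n", props.name, props.device_id ? props.device_id : "unknown id");
        ggml_backend_free(backend);
        return 0;
    }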
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c205bb0f9..cabb3b192 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -560,7 +560,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
 }
 
 static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
-#if defined(GGML_USE_HIP) && defined(GCN)
+#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
     asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
 #else
 #ifdef FAST_FP16_AVAILABLE
@@ -572,7 +572,21 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
     acc += tmpv.x * tmpu.x;
     acc += tmpv.y * tmpu.y;
 #endif // FAST_FP16_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(GCN)
+#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
+}
+
+// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
+template <int nbytes>
+static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
+    if constexpr (nbytes == 4) {
+        *(int *) dst = *(const int *) src;
+    } else if constexpr (nbytes == 8) {
+        *(int2 *) dst = *(const int2 *) src;
+    } else if constexpr (nbytes == 16) {
+        *(int4 *) dst = *(const int4 *) src;
+    } else {
+        static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
+    }
 }
 
 static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
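A usage sketch for ggml_cuda_memcpy_1 (illustration only, not from the patch): the template collapses what would otherwise be several 4-byte copies into one aligned 8- or 16-byte transaction, which is also why the flash-attention kernel below pads its shared-memory rows by cpy_ne elements rather than 1.

    // both pointers must be 16-byte aligned, otherwise the int4 access is undefined behavior
    static __device__ void copy_four_floats(float * dst, const float * src) {
        ggml_cuda_memcpy_1<16>(dst, src); // one int4 transfer instead of four int transfers
    }

Unsupported sizes are rejected at compile time by the always-false static_assert in the final else branch.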
diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu
index 64f7d4a1a..c6a399ce5 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cu
+++ b/ggml/src/ggml-cuda/fattn-tile.cu
@@ -8,11 +8,14 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
     if (GGML_CUDA_CC_IS_AMD(cc)) {
         switch (D) {
             case 64:
-                return ncols <= 16 ? 32 : 64;
-            case 128:
-                return ncols <= 16 ? 64 : warp_size;
-            case 256:
                 return 64;
+            case 128:
+            case 256:
+                if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
+                    return ncols <= 16 ? 64 : 32;
+                } else {
+                    return 64;
+                }
             default:
                 GGML_ABORT("fatal error");
                 return -1;
@@ -41,17 +44,26 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
         GGML_ABORT("fatal error");
         return -1;
     }
+    GGML_UNUSED(warp_size);
 }
 
 static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
 #ifdef GGML_USE_HIP
     switch (D) {
         case 64:
-            return ncols <= 16 ? 32 : 64;
-        case 128:
-            return ncols <= 16 ? 64 : warp_size;
-        case 256:
             return 64;
+        case 128:
+#if defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 64 : 32;
+#else
+            return 64;
+#endif // defined(GCN) || defined(CDNA)
+        case 256:
+#if defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 64 : 32;
+#else
+            return 64;
+#endif // defined(GCN) || defined(CDNA)
         default:
             return -1;
     }
@@ -88,9 +100,17 @@ static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols
         case 64:
             return 64;
         case 128:
-            return ncols <= 16 ? 2*warp_size : 128;
+#if defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 64 : 128;
+#else
+            return 64;
+#endif // defined(GCN) || defined(CDNA)
         case 256:
-            return ncols <= 16 ? 128 : 2*warp_size;
+#if defined(GCN) || defined(CDNA)
+            return ncols <= 16 ? 64 : 128;
+#else
+            return ncols <= 16 ? 64 : 256;
+#endif // defined(GCN) || defined(CDNA)
         default:
             return -1;
     }
@@ -196,14 +216,21 @@ static __global__ void flash_attn_tile(
 
     const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
 
+#if defined(GGML_USE_HIP)
+    constexpr int cpy_nb = 16;
+#else
+    constexpr int cpy_nb = 8;
+#endif // defined(GGML_USE_HIP)
+    constexpr int cpy_ne = cpy_nb / 4;
+
     __shared__ float KQ[ncols][kq_stride];
 #ifdef FAST_FP16_AVAILABLE
     __shared__ half2 Q_tmp[ncols][D/2];
-    __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + 1)]; // Padded to avoid memory bank conflicts.
+    __shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
     half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #else
     __shared__ float Q_tmp[ncols][D];
-    __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + 1)]; // Padded to avoid memory bank conflicts.
+    __shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
     float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
     float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
 #endif // FAST_FP16_AVAILABLE
@@ -256,11 +283,11 @@ static __global__ void flash_attn_tile(
             for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
                 const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
 #ifdef FAST_FP16_AVAILABLE
-                KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1 + threadIdx.x] = tmp_h2;
+                KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2;
 #else
                 const float2 tmp_f2 = __half22float2(tmp_h2);
-                KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
-                KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
+                KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
+                KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
 #endif // FAST_FP16_AVAILABLE
             }
         }
@@ -269,14 +296,14 @@ static __global__ void flash_attn_tile(
 
 #ifdef FAST_FP16_AVAILABLE
 #pragma unroll
-        for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; ++k_KQ_1) {
-            half2 K_k[kq_stride/warp_size];
-            half2 Q_k[ncols/nwarps];
+        for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) {
+            half2 K_k[kq_stride/warp_size][cpy_ne];
+            half2 Q_k[ncols/nwarps][cpy_ne];
 #else
 #pragma unroll
-        for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; ++k_KQ_1) {
-            float K_k[kq_stride/warp_size];
-            float Q_k[ncols/nwarps];
+        for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) {
+            float K_k[kq_stride/warp_size][cpy_ne];
+            float Q_k[ncols/nwarps][cpy_ne];
 #endif // FAST_FP16_AVAILABLE
@@ -284,9 +311,9 @@ static __global__ void flash_attn_tile(
                 const int i_KQ = i_KQ_0 + threadIdx.x;
 
 #ifdef FAST_FP16_AVAILABLE
-                K_k[i_KQ_0/warp_size] = KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1];
+                ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
 #else
-                K_k[i_KQ_0/warp_size] = KV_tmp_f [i_KQ*(kq_nbatch + 1) + k_KQ_1];
+                ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]);
 #endif // FAST_FP16_AVAILABLE
             }
 #pragma unroll
@@ -294,9 +321,9 @@ static __global__ void flash_attn_tile(
                 const int j_KQ = j_KQ_0 + threadIdx.y;
 
 #ifdef FAST_FP16_AVAILABLE
-                Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1];
+                ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
 #else
-                Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0 + k_KQ_1];
+                ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]);
 #endif // FAST_FP16_AVAILABLE
             }
@@ -304,7 +331,10 @@ static __global__ void flash_attn_tile(
             for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
 #pragma unroll
                 for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
-                    ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size], Q_k[j_KQ_0/nwarps]);
+#pragma unroll
+                    for (int k = 0; k < cpy_ne; ++k) {
+                        ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]);
+                    }
                 }
             }
         }
@@ -345,14 +375,54 @@ static __global__ void flash_attn_tile(
             kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
 
             float kqsum_add = 0.0f;
+            if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) {
 #pragma unroll
-            for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
-                const int i = i0 + threadIdx.x;
+                for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) {
+                    const int i = i0 + 4*threadIdx.x;
 
-                const float diff = KQ[j][i] - kqmax[j0/nwarps];
-                const float val = expf(diff);
-                kqsum_add += val;
-                KQ[j][i] = val;
+                    float4 val = *(const float4 *) &KQ[j][i];
+                    val.x = expf(val.x - kqmax[j0/nwarps]);
+                    val.y = expf(val.y - kqmax[j0/nwarps]);
+                    val.z = expf(val.z - kqmax[j0/nwarps]);
+                    val.w = expf(val.w - kqmax[j0/nwarps]);
+                    kqsum_add += val.x + val.y + val.z + val.w;
+
+#ifdef FAST_FP16_AVAILABLE
+                    const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)};
+                    ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
+#else
+                    ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
+#endif // FAST_FP16_AVAILABLE
+                }
+            } else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) {
+#pragma unroll
+                for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) {
+                    const int i = i0 + 2*threadIdx.x;
+
+                    float2 val = *(const float2 *) &KQ[j][i];
+                    val.x = expf(val.x - kqmax[j0/nwarps]);
+                    val.y = expf(val.y - kqmax[j0/nwarps]);
+                    kqsum_add += val.x + val.y;
+#ifdef FAST_FP16_AVAILABLE
+                    const half2 tmp = make_half2(val.x, val.y);
+                    ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
+#else
+                    ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
+#endif // FAST_FP16_AVAILABLE
+                }
+            } else {
+                for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
+                    const int i = i0 + threadIdx.x;
+
+                    const float diff = KQ[j][i] - kqmax[j0/nwarps];
+                    const float val = expf(diff);
+                    kqsum_add += val;
+#ifdef FAST_FP16_AVAILABLE
+                    ((half *) KQ[j])[i] = val;
+#else
+                    KQ[j][i] = val;
+#endif // FAST_FP16_AVAILABLE
+                }
             }
 
             kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
@@ -419,8 +489,7 @@ static __global__ void flash_attn_tile(
                 const int j = j0 + threadIdx.y;
 
 #ifdef FAST_FP16_AVAILABLE
-                const float tmp = KQ[j][k0 + k1];
-                KQ_k[j0/nwarps] = make_half2(tmp, tmp);
+                KQ_k[j0/nwarps] = __half2half2(((const half *) KQ[j])[k0 + k1]);
 #else
                 KQ_k[j0/nwarps] = KQ[j][k0 + k1];
 #endif // FAST_FP16_AVAILABLE
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 391ac4b7a..dcb32d08b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3223,6 +3223,7 @@ struct ggml_backend_cuda_device_context {
     int device;
     std::string name;
    std::string description;
+    std::string pci_bus_id;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3247,9 +3248,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+
     props->name        = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
     props->type        = ggml_backend_cuda_device_get_type(dev);
+    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -3818,6 +3822,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
             CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
             dev_ctx->description = prop.name;
 
+            char pci_bus_id[16] = {};
+            snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
+            dev_ctx->pci_bus_id = pci_bus_id;
+
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_cuda_device_interface,
                 /* .reg   = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c6a33d5de..12bbee455 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -162,6 +162,14 @@
 #define GCN
 #endif
 
+#if defined(__gfx900__) || defined(__gfx906__)
+#define GCN5
+#endif
+
+#if defined(__gfx803__)
+#define GCN4
+#endif
+
 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
 #define CDNA // For the entire family
 #endif
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ba5506a32..0749b5646 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1976,7 +1976,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         }
     }
 
-    if (buf->device_memory == VK_NULL_HANDLE) {
+    if (!buf->device_memory) {
         device->device.destroyBuffer(buf->buffer);
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
     }
@@ -4527,7 +4527,7 @@ static void ggml_vk_instance_init() {
             new_driver.pNext = &new_id;
             devices[i].getProperties2(&new_props);
 
-            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
@@ -4597,7 +4597,7 @@ static void ggml_vk_instance_init() {
             }
         }
 
-        // If no dedicated GPUs found, fall back to the first non-CPU device.
+        // If no GPUs found, fall back to the first non-CPU device.
         // If only CPU devices are available, return without devices.
         if (vk_instance.device_indices.empty()) {
             for (size_t i = 0; i < devices.size(); i++) {
@@ -12108,12 +12108,63 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
     }
 }
 
+static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
+    GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
+
+    vk::PhysicalDeviceProperties2 props = {};
+    device.getProperties2(&props);
+
+    return props.properties.deviceType;
+}
+
+static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+    GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
+
+    const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+    bool ext_support = false;
+
+    for (const auto& properties : ext_props) {
+        if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) {
+            ext_support = true;
+            break;
+        }
+    }
+
+    if (!ext_support) {
+        return "";
+    }
+
+    vk::PhysicalDeviceProperties2 props = {};
+    vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {};
+
+    props.pNext = &pci_bus_info;
+
+    device.getProperties2(&props);
+
+    const uint32_t pci_domain = pci_bus_info.pciDomain;
+    const uint32_t pci_bus = pci_bus_info.pciBus;
+    const uint32_t pci_device = pci_bus_info.pciDevice;
+    const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning
+
+    char pci_bus_id[16] = {};
+    snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+
+    return std::string(pci_bus_id);
+}
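Note that the Vulkan helper above emits the same canonical "domain:bus:device.function" form that the CUDA backend produces (e.g. "0000:01:00.0"), which is what lets the duplicate-device check in llama.cpp further down match the same physical card across backends; the only difference is that CUDA hard-codes function ".0" while Vulkan reports the actual function number.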
+
 //////////////////////////
 
 struct ggml_backend_vk_device_context {
     size_t device;
     std::string name;
     std::string description;
+    bool is_integrated_gpu;
+    std::string pci_bus_id;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -12142,14 +12193,18 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
 }
 
 static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
-    UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU;
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+
+    return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+
     props->name = ggml_backend_vk_device_get_name(dev);
     props->description = ggml_backend_vk_device_get_description(dev);
     props->type = ggml_backend_vk_device_get_type(dev);
+    props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
         /* .async = */ false,
@@ -12416,8 +12471,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             }
 
             if (
-                src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32 ||
-                src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32
+                (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) ||
+                (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32)
             ) {
                 return true;
             }
@@ -12582,6 +12637,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
             ctx->device = i;
             ctx->name = GGML_VK_NAME + std::to_string(i);
             ctx->description = desc;
+            ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
+            ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
             devices.push_back(new ggml_backend_device {
                 /* .iface = */ ggml_backend_vk_device_i,
                 /* .reg   = */ reg,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 7df7fc6a0..17479166c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -181,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7711e7d86..c3b1de195 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -923,7 +923,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         new_type = tensor->type;
         new_data = tensor->data;
         new_size = ggml_nbytes(tensor);
-        LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
     } else {
         const int64_t nelements = ggml_nelements(tensor);
@@ -1040,8 +1040,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     close_ofstream();
-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
 
     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
diff --git a/src/llama.cpp b/src/llama.cpp
index 70b399544..aeba0807e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -89,6 +89,7 @@ bool llama_supports_gpu_offload(void) {
     return true;
 #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 #endif
 }
@@ -215,8 +216,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -225,19 +231,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                __func__,
+                                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                props.device_id ? props.device_id : "unknown id",
+                                ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
 
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
@@ -258,9 +296,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+            ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+            props.device_id ? props.device_id : "unknown id",
+            props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);
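A worked example of the resulting default device order (hypothetical machine: one RPC server R, two discrete GPUs G0/G1 with distinct ids, one integrated GPU I0): model->devices becomes [R, G0, G1]; I0 is added only when the list would otherwise be empty, so an iGPU-only machine yields [I0]. If G1 reported the same id as G0 (the same card seen through two backends), it would be dropped with the "skipping device" message above.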
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 39ef439e9..160b97cf7 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2372,7 +2372,7 @@ struct server_context {
             }
 
             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
+                SLT_INF(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %.3f (> %.3f thold)\n", lcs_len, similarity, slot_prompt_similarity);
             }
         }
 
@@ -2394,7 +2394,7 @@ struct server_context {
             }
 
             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
+                SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last);
             }
         }
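Finally, the duplicate-detection rule used in the llama.cpp hunk earlier reduces to a small predicate (illustrative sketch, not part of the patch): devices are merged only when both report a non-NULL id and the ids compare equal, so devices with unknown ids are never merged away.

    #include <cstring>

    // hypothetical helper mirroring the std::find_if lambda in llama_model_load_from_file_impl
    static bool same_physical_device(const char * id_a, const char * id_b) {
        return id_a != nullptr && id_b != nullptr && std::strcmp(id_a, id_b) == 0;
    }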